drivers/iommu/intel/iommu.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright © 2006-2014 Intel Corporation.
   4  *
   5  * Authors: David Woodhouse <dwmw2@infradead.org>,
   6  *          Ashok Raj <ashok.raj@intel.com>,
   7  *          Shaohua Li <shaohua.li@intel.com>,
   8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9  *          Fenghua Yu <fenghua.yu@intel.com>
  10  *          Joerg Roedel <jroedel@suse.de>
  11  */
  12
  13 #define pr_fmt(fmt)     "DMAR: " fmt
  14 #define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16 #include <linux/init.h>
  17 #include <linux/bitmap.h>
  18 #include <linux/debugfs.h>
  19 #include <linux/export.h>
  20 #include <linux/slab.h>
  21 #include <linux/irq.h>
  22 #include <linux/interrupt.h>
  23 #include <linux/spinlock.h>
  24 #include <linux/pci.h>
  25 #include <linux/dmar.h>
  26 #include <linux/dma-map-ops.h>
  27 #include <linux/mempool.h>
  28 #include <linux/memory.h>
  29 #include <linux/cpu.h>
  30 #include <linux/timer.h>
  31 #include <linux/io.h>
  32 #include <linux/iova.h>
  33 #include <linux/iommu.h>
  34 #include <linux/intel-iommu.h>
  35 #include <linux/syscore_ops.h>
  36 #include <linux/tboot.h>
  37 #include <linux/dmi.h>
  38 #include <linux/pci-ats.h>
  39 #include <linux/memblock.h>
  40 #include <linux/dma-map-ops.h>
  41 #include <linux/dma-direct.h>
  42 #include <linux/crash_dump.h>
  43 #include <linux/numa.h>
  44 #include <linux/swiotlb.h>
  45 #include <asm/irq_remapping.h>
  46 #include <asm/cacheflush.h>
  47 #include <asm/iommu.h>
  48 #include <trace/events/intel_iommu.h>
  49
  50 #include "../irq_remapping.h"
  51 #include "pasid.h"
  52
  53 #define ROOT_SIZE               VTD_PAGE_SIZE
  54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  55
  56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  60
  61 #define IOAPIC_RANGE_START      (0xfee00000)
  62 #define IOAPIC_RANGE_END        (0xfeefffff)
  63 #define IOVA_START_ADDR         (0x1000)
  64
  65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  66
  67 #define MAX_AGAW_WIDTH 64
  68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  69
  70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  72
  73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  78
  79 /* IO virtual address start page frame number */
  80 #define IOVA_START_PFN          (1)
  81
  82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  83
  84 /* page table handling */
  85 #define LEVEL_STRIDE            (9)
  86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  87
  88 /*
  89  * This bitmap is used to advertise the page sizes our hardware support
  90  * to the IOMMU core, which will then use this information to split
  91  * physically contiguous memory regions it is mapping into page sizes
  92  * that we support.
  93  *
  94  * Traditionally the IOMMU core just handed us the mappings directly,
  95  * after making sure the size is an order of a 4KiB page and that the
  96  * mapping has natural alignment.
  97  *
  98  * To retain this behavior, we currently advertise that we support
  99  * all page sizes that are an order of 4KiB.
 100  *
 101  * If at some point we'd like to utilize the IOMMU core's new behavior,
 102  * we could change this to advertise the real page sizes we support.
 103  */
 104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 105
 106 static inline int agaw_to_level(int agaw)
 107 {
 108         return agaw + 2;
 109 }
 110
 111 static inline int agaw_to_width(int agaw)
 112 {
 113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 114 }
 115
 116 static inline int width_to_agaw(int width)
 117 {
 118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 119 }
 120
 121 static inline unsigned int level_to_offset_bits(int level)
 122 {
 123         return (level - 1) * LEVEL_STRIDE;
 124 }
 125
 126 static inline int pfn_level_offset(u64 pfn, int level)
 127 {
 128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 129 }
 130
 131 static inline u64 level_mask(int level)
 132 {
 133         return -1ULL << level_to_offset_bits(level);
 134 }
 135
 136 static inline u64 level_size(int level)
 137 {
 138         return 1ULL << level_to_offset_bits(level);
 139 }
 140
 141 static inline u64 align_to_level(u64 pfn, int level)
 142 {
 143         return (pfn + level_size(level) - 1) & level_mask(level);
 144 }
 145
 146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 147 {
 148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 149 }
 150
 151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 152    are never going to work. */
 153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 154 {
 155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 156 }
 157
 158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 159 {
 160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 161 }
 162 static inline unsigned long page_to_dma_pfn(struct page *pg)
 163 {
 164         return mm_to_dma_pfn(page_to_pfn(pg));
 165 }
 166 static inline unsigned long virt_to_dma_pfn(void *p)
 167 {
 168         return page_to_dma_pfn(virt_to_page(p));
 169 }
 170
 171 /* global iommu list, set NULL for ignored DMAR units */
 172 static struct intel_iommu **g_iommus;
 173
 174 static void __init check_tylersburg_isoch(void);
 175 static int rwbf_quirk;
 176
 177 /*
 178  * set to 1 to panic kernel if can't successfully enable VT-d
 179  * (used when kernel is launched w/ TXT)
 180  */
 181 static int force_on = 0;
 182 int intel_iommu_tboot_noforce;
 183 static int no_platform_optin;
 184
 185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 186
 187 /*
 188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 189  * if marked present.
 190  */
 191 static phys_addr_t root_entry_lctp(struct root_entry *re)
 192 {
 193         if (!(re->lo & 1))
 194                 return 0;
 195
 196         return re->lo & VTD_PAGE_MASK;
 197 }
 198
 199 /*
 200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 201  * if marked present.
 202  */
 203 static phys_addr_t root_entry_uctp(struct root_entry *re)
 204 {
 205         if (!(re->hi & 1))
 206                 return 0;
 207
 208         return re->hi & VTD_PAGE_MASK;
 209 }
 210
 211 static inline void context_clear_pasid_enable(struct context_entry *context)
 212 {
 213         context->lo &= ~(1ULL << 11);
 214 }
 215
 216 static inline bool context_pasid_enabled(struct context_entry *context)
 217 {
 218         return !!(context->lo & (1ULL << 11));
 219 }
 220
 221 static inline void context_set_copied(struct context_entry *context)
 222 {
 223         context->hi |= (1ull << 3);
 224 }
 225
 226 static inline bool context_copied(struct context_entry *context)
 227 {
 228         return !!(context->hi & (1ULL << 3));
 229 }
 230
 231 static inline bool __context_present(struct context_entry *context)
 232 {
 233         return (context->lo & 1);
 234 }
 235
 236 bool context_present(struct context_entry *context)
 237 {
 238         return context_pasid_enabled(context) ?
 239              __context_present(context) :
 240              __context_present(context) && !context_copied(context);
 241 }
 242
 243 static inline void context_set_present(struct context_entry *context)
 244 {
 245         context->lo |= 1;
 246 }
 247
 248 static inline void context_set_fault_enable(struct context_entry *context)
 249 {
 250         context->lo &= (((u64)-1) << 2) | 1;
 251 }
 252
 253 static inline void context_set_translation_type(struct context_entry *context,
 254                                                 unsigned long value)
 255 {
 256         context->lo &= (((u64)-1) << 4) | 3;
 257         context->lo |= (value & 3) << 2;
 258 }
 259
 260 static inline void context_set_address_root(struct context_entry *context,
 261                                             unsigned long value)
 262 {
 263         context->lo &= ~VTD_PAGE_MASK;
 264         context->lo |= value & VTD_PAGE_MASK;
 265 }
 266
 267 static inline void context_set_address_width(struct context_entry *context,
 268                                              unsigned long value)
 269 {
 270         context->hi |= value & 7;
 271 }
 272
 273 static inline void context_set_domain_id(struct context_entry *context,
 274                                          unsigned long value)
 275 {
 276         context->hi |= (value & ((1 << 16) - 1)) << 8;
 277 }
 278
 279 static inline int context_domain_id(struct context_entry *c)
 280 {
 281         return((c->hi >> 8) & 0xffff);
 282 }
 283
 284 static inline void context_clear_entry(struct context_entry *context)
 285 {
 286         context->lo = 0;
 287         context->hi = 0;
 288 }
 289
 290 /*
 291  * This domain is a statically identity mapping domain.
 292  *      1. This domain creats a static 1:1 mapping to all usable memory.
 293  *      2. It maps to each iommu if successful.
 294  *      3. Each iommu mapps to this domain if successful.
 295  */
 296 static struct dmar_domain *si_domain;
 297 static int hw_pass_through = 1;
 298
 299 #define for_each_domain_iommu(idx, domain)                      \
 300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 301                 if (domain->iommu_refcnt[idx])
 302
 303 struct dmar_rmrr_unit {
 304         struct list_head list;          /* list of rmrr units   */
 305         struct acpi_dmar_header *hdr;   /* ACPI header          */
 306         u64     base_address;           /* reserved base address*/
 307         u64     end_address;            /* reserved end address */
 308         struct dmar_dev_scope *devices; /* target devices */
 309         int     devices_cnt;            /* target device count */
 310 };
 311
 312 struct dmar_atsr_unit {
 313         struct list_head list;          /* list of ATSR units */
 314         struct acpi_dmar_header *hdr;   /* ACPI header */
 315         struct dmar_dev_scope *devices; /* target devices */
 316         int devices_cnt;                /* target device count */
 317         u8 include_all:1;               /* include all ports */
 318 };
 319
 320 static LIST_HEAD(dmar_atsr_units);
 321 static LIST_HEAD(dmar_rmrr_units);
 322
 323 #define for_each_rmrr_units(rmrr) \
 324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 325
 326 /* bitmap for indexing intel_iommus */
 327 static int g_num_of_iommus;
 328
 329 static void domain_exit(struct dmar_domain *domain);
 330 static void domain_remove_dev_info(struct dmar_domain *domain);
 331 static void dmar_remove_one_dev_info(struct device *dev);
 332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 333 static int intel_iommu_attach_device(struct iommu_domain *domain,
 334                                      struct device *dev);
 335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 336                                             dma_addr_t iova);
 337
 338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 339 int dmar_disabled = 0;
 340 #else
 341 int dmar_disabled = 1;
 342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
 343
 344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
 345 int intel_iommu_sm = 1;
 346 #else
 347 int intel_iommu_sm;
 348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
 349
 350 int intel_iommu_enabled = 0;
 351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 352
 353 static int dmar_map_gfx = 1;
 354 static int dmar_forcedac;
 355 static int intel_iommu_strict;
 356 static int intel_iommu_superpage = 1;
 357 static int iommu_identity_mapping;
 358 static int intel_no_bounce;
 359 static int iommu_skip_te_disable;
 360
 361 #define IDENTMAP_GFX            2
 362 #define IDENTMAP_AZALIA         4
 363
 364 int intel_iommu_gfx_mapped;
 365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 366
 367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 368 struct device_domain_info *get_domain_info(struct device *dev)
 369 {
 370         struct device_domain_info *info;
 371
 372         if (!dev)
 373                 return NULL;
 374
 375         info = dev_iommu_priv_get(dev);
 376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
 377                 return NULL;
 378
 379         return info;
 380 }
 381
 382 DEFINE_SPINLOCK(device_domain_lock);
 383 static LIST_HEAD(device_domain_list);
 384
 385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
 386                                 to_pci_dev(d)->untrusted)
 387
 388 /*
 389  * Iterate over elements in device_domain_list and call the specified
 390  * callback @fn against each element.
 391  */
 392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 393                                      void *data), void *data)
 394 {
 395         int ret = 0;
 396         unsigned long flags;
 397         struct device_domain_info *info;
 398
 399         spin_lock_irqsave(&device_domain_lock, flags);
 400         list_for_each_entry(info, &device_domain_list, global) {
 401                 ret = fn(info, data);
 402                 if (ret) {
 403                         spin_unlock_irqrestore(&device_domain_lock, flags);
 404                         return ret;
 405                 }
 406         }
 407         spin_unlock_irqrestore(&device_domain_lock, flags);
 408
 409         return 0;
 410 }
 411
 412 const struct iommu_ops intel_iommu_ops;
 413
 414 static bool translation_pre_enabled(struct intel_iommu *iommu)
 415 {
 416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 417 }
 418
 419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 420 {
 421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 422 }
 423
 424 static void init_translation_status(struct intel_iommu *iommu)
 425 {
 426         u32 gsts;
 427
 428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 429         if (gsts & DMA_GSTS_TES)
 430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 431 }
 432
 433 static int __init intel_iommu_setup(char *str)
 434 {
 435         if (!str)
 436                 return -EINVAL;
 437         while (*str) {
 438                 if (!strncmp(str, "on", 2)) {
 439                         dmar_disabled = 0;
 440                         pr_info("IOMMU enabled\n");
 441                 } else if (!strncmp(str, "off", 3)) {
 442                         dmar_disabled = 1;
 443                         no_platform_optin = 1;
 444                         pr_info("IOMMU disabled\n");
 445                 } else if (!strncmp(str, "igfx_off", 8)) {
 446                         dmar_map_gfx = 0;
 447                         pr_info("Disable GFX device mapping\n");
 448                 } else if (!strncmp(str, "forcedac", 8)) {
 449                         pr_info("Forcing DAC for PCI devices\n");
 450                         dmar_forcedac = 1;
 451                 } else if (!strncmp(str, "strict", 6)) {
 452                         pr_info("Disable batched IOTLB flush\n");
 453                         intel_iommu_strict = 1;
 454                 } else if (!strncmp(str, "sp_off", 6)) {
 455                         pr_info("Disable supported super page\n");
 456                         intel_iommu_superpage = 0;
 457                 } else if (!strncmp(str, "sm_on", 5)) {
 458                         pr_info("Intel-IOMMU: scalable mode supported\n");
 459                         intel_iommu_sm = 1;
 460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
 461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 462                         intel_iommu_tboot_noforce = 1;
 463                 } else if (!strncmp(str, "nobounce", 8)) {
 464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
 465                         intel_no_bounce = 1;
 466                 }
 467
 468                 str += strcspn(str, ",");
 469                 while (*str == ',')
 470                         str++;
 471         }
 472         return 0;
 473 }
 474 __setup("intel_iommu=", intel_iommu_setup);
 475
 476 static struct kmem_cache *iommu_domain_cache;
 477 static struct kmem_cache *iommu_devinfo_cache;
 478
 479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 480 {
 481         struct dmar_domain **domains;
 482         int idx = did >> 8;
 483
 484         domains = iommu->domains[idx];
 485         if (!domains)
 486                 return NULL;
 487
 488         return domains[did & 0xff];
 489 }
 490
 491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 492                              struct dmar_domain *domain)
 493 {
 494         struct dmar_domain **domains;
 495         int idx = did >> 8;
 496
 497         if (!iommu->domains[idx]) {
 498                 size_t size = 256 * sizeof(struct dmar_domain *);
 499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 500         }
 501
 502         domains = iommu->domains[idx];
 503         if (WARN_ON(!domains))
 504                 return;
 505         else
 506                 domains[did & 0xff] = domain;
 507 }
 508
 509 void *alloc_pgtable_page(int node)
 510 {
 511         struct page *page;
 512         void *vaddr = NULL;
 513
 514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 515         if (page)
 516                 vaddr = page_address(page);
 517         return vaddr;
 518 }
 519
 520 void free_pgtable_page(void *vaddr)
 521 {
 522         free_page((unsigned long)vaddr);
 523 }
 524
 525 static inline void *alloc_domain_mem(void)
 526 {
 527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 528 }
 529
 530 static void free_domain_mem(void *vaddr)
 531 {
 532         kmem_cache_free(iommu_domain_cache, vaddr);
 533 }
 534
 535 static inline void * alloc_devinfo_mem(void)
 536 {
 537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 538 }
 539
 540 static inline void free_devinfo_mem(void *vaddr)
 541 {
 542         kmem_cache_free(iommu_devinfo_cache, vaddr);
 543 }
 544
 545 static inline int domain_type_is_si(struct dmar_domain *domain)
 546 {
 547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 548 }
 549
 550 static inline bool domain_use_first_level(struct dmar_domain *domain)
 551 {
 552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
 553 }
 554
 555 static inline int domain_pfn_supported(struct dmar_domain *domain,
 556                                        unsigned long pfn)
 557 {
 558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 559
 560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 561 }
 562
 563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 564 {
 565         unsigned long sagaw;
 566         int agaw = -1;
 567
 568         sagaw = cap_sagaw(iommu->cap);
 569         for (agaw = width_to_agaw(max_gaw);
 570              agaw >= 0; agaw--) {
 571                 if (test_bit(agaw, &sagaw))
 572                         break;
 573         }
 574
 575         return agaw;
 576 }
 577
 578 /*
 579  * Calculate max SAGAW for each iommu.
 580  */
 581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 582 {
 583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 584 }
 585
 586 /*
 587  * calculate agaw for each iommu.
 588  * "SAGAW" may be different across iommus, use a default agaw, and
 589  * get a supported less agaw for iommus that don't support the default agaw.
 590  */
 591 int iommu_calculate_agaw(struct intel_iommu *iommu)
 592 {
 593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 594 }
 595
 596 /* This functionin only returns single iommu in a domain */
 597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 598 {
 599         int iommu_id;
 600
 601         /* si_domain and vm domain should not get here. */
 602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
 603                 return NULL;
 604
 605         for_each_domain_iommu(iommu_id, domain)
 606                 break;
 607
 608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 609                 return NULL;
 610
 611         return g_iommus[iommu_id];
 612 }
 613
 614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 615 {
 616         return sm_supported(iommu) ?
 617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 618 }
 619
 620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 621 {
 622         struct dmar_drhd_unit *drhd;
 623         struct intel_iommu *iommu;
 624         bool found = false;
 625         int i;
 626
 627         domain->iommu_coherency = 1;
 628
 629         for_each_domain_iommu(i, domain) {
 630                 found = true;
 631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
 632                         domain->iommu_coherency = 0;
 633                         break;
 634                 }
 635         }
 636         if (found)
 637                 return;
 638
 639         /* No hardware attached; use lowest common denominator */
 640         rcu_read_lock();
 641         for_each_active_iommu(iommu, drhd) {
 642                 if (!iommu_paging_structure_coherency(iommu)) {
 643                         domain->iommu_coherency = 0;
 644                         break;
 645                 }
 646         }
 647         rcu_read_unlock();
 648 }
 649
 650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 651 {
 652         struct dmar_drhd_unit *drhd;
 653         struct intel_iommu *iommu;
 654         int ret = 1;
 655
 656         rcu_read_lock();
 657         for_each_active_iommu(iommu, drhd) {
 658                 if (iommu != skip) {
 659                         if (!ecap_sc_support(iommu->ecap)) {
 660                                 ret = 0;
 661                                 break;
 662                         }
 663                 }
 664         }
 665         rcu_read_unlock();
 666
 667         return ret;
 668 }
 669
 670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
 671                                          struct intel_iommu *skip)
 672 {
 673         struct dmar_drhd_unit *drhd;
 674         struct intel_iommu *iommu;
 675         int mask = 0x3;
 676
 677         if (!intel_iommu_superpage) {
 678                 return 0;
 679         }
 680
 681         /* set iommu_superpage to the smallest common denominator */
 682         rcu_read_lock();
 683         for_each_active_iommu(iommu, drhd) {
 684                 if (iommu != skip) {
 685                         if (domain && domain_use_first_level(domain)) {
 686                                 if (!cap_fl1gp_support(iommu->cap))
 687                                         mask = 0x1;
 688                         } else {
 689                                 mask &= cap_super_page_val(iommu->cap);
 690                         }
 691
 692                         if (!mask)
 693                                 break;
 694                 }
 695         }
 696         rcu_read_unlock();
 697
 698         return fls(mask);
 699 }
 700
 701 static int domain_update_device_node(struct dmar_domain *domain)
 702 {
 703         struct device_domain_info *info;
 704         int nid = NUMA_NO_NODE;
 705
 706         assert_spin_locked(&device_domain_lock);
 707
 708         if (list_empty(&domain->devices))
 709                 return NUMA_NO_NODE;
 710
 711         list_for_each_entry(info, &domain->devices, link) {
 712                 if (!info->dev)
 713                         continue;
 714
 715                 /*
 716                  * There could possibly be multiple device numa nodes as devices
 717                  * within the same domain may sit behind different IOMMUs. There
 718                  * isn't perfect answer in such situation, so we select first
 719                  * come first served policy.
 720                  */
 721                 nid = dev_to_node(info->dev);
 722                 if (nid != NUMA_NO_NODE)
 723                         break;
 724         }
 725
 726         return nid;
 727 }
 728
 729 /* Some capabilities may be different across iommus */
 730 static void domain_update_iommu_cap(struct dmar_domain *domain)
 731 {
 732         domain_update_iommu_coherency(domain);
 733         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 734         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 735
 736         /*
 737          * If RHSA is missing, we should default to the device numa domain
 738          * as fall back.
 739          */
 740         if (domain->nid == NUMA_NO_NODE)
 741                 domain->nid = domain_update_device_node(domain);
 742 }
 743
 744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 745                                          u8 devfn, int alloc)
 746 {
 747         struct root_entry *root = &iommu->root_entry[bus];
 748         struct context_entry *context;
 749         u64 *entry;
 750
 751         entry = &root->lo;
 752         if (sm_supported(iommu)) {
 753                 if (devfn >= 0x80) {
 754                         devfn -= 0x80;
 755                         entry = &root->hi;
 756                 }
 757                 devfn *= 2;
 758         }
 759         if (*entry & 1)
 760                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 761         else {
 762                 unsigned long phy_addr;
 763                 if (!alloc)
 764                         return NULL;
 765
 766                 context = alloc_pgtable_page(iommu->node);
 767                 if (!context)
 768                         return NULL;
 769
 770                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 771                 phy_addr = virt_to_phys((void *)context);
 772                 *entry = phy_addr | 1;
 773                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 774         }
 775         return &context[devfn];
 776 }
 777
 778 static bool attach_deferred(struct device *dev)
 779 {
 780         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
 781 }
 782
 783 /**
 784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 785  *                               sub-hierarchy of a candidate PCI-PCI bridge
 786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 787  * @bridge: the candidate PCI-PCI bridge
 788  *
 789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 790  */
 791 static bool
 792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 793 {
 794         struct pci_dev *pdev, *pbridge;
 795
 796         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 797                 return false;
 798
 799         pdev = to_pci_dev(dev);
 800         pbridge = to_pci_dev(bridge);
 801
 802         if (pbridge->subordinate &&
 803             pbridge->subordinate->number <= pdev->bus->number &&
 804             pbridge->subordinate->busn_res.end >= pdev->bus->number)
 805                 return true;
 806
 807         return false;
 808 }
 809
 810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 811 {
 812         struct dmar_drhd_unit *drhd;
 813         u32 vtbar;
 814         int rc;
 815
 816         /* We know that this device on this chipset has its own IOMMU.
 817          * If we find it under a different IOMMU, then the BIOS is lying
 818          * to us. Hope that the IOMMU for this device is actually
 819          * disabled, and it needs no translation...
 820          */
 821         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 822         if (rc) {
 823                 /* "can't" happen */
 824                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 825                 return false;
 826         }
 827         vtbar &= 0xffff0000;
 828
 829         /* we know that the this iommu should be at offset 0xa000 from vtbar */
 830         drhd = dmar_find_matched_drhd_unit(pdev);
 831         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 832                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 833                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 834                 return true;
 835         }
 836
 837         return false;
 838 }
 839
 840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 841 {
 842         if (!iommu || iommu->drhd->ignored)
 843                 return true;
 844
 845         if (dev_is_pci(dev)) {
 846                 struct pci_dev *pdev = to_pci_dev(dev);
 847
 848                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 849                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 850                     quirk_ioat_snb_local_iommu(pdev))
 851                         return true;
 852         }
 853
 854         return false;
 855 }
 856
 857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 858 {
 859         struct dmar_drhd_unit *drhd = NULL;
 860         struct pci_dev *pdev = NULL;
 861         struct intel_iommu *iommu;
 862         struct device *tmp;
 863         u16 segment = 0;
 864         int i;
 865
 866         if (!dev)
 867                 return NULL;
 868
 869         if (dev_is_pci(dev)) {
 870                 struct pci_dev *pf_pdev;
 871
 872                 pdev = pci_real_dma_dev(to_pci_dev(dev));
 873
 874                 /* VFs aren't listed in scope tables; we need to look up
 875                  * the PF instead to find the IOMMU. */
 876                 pf_pdev = pci_physfn(pdev);
 877                 dev = &pf_pdev->dev;
 878                 segment = pci_domain_nr(pdev->bus);
 879         } else if (has_acpi_companion(dev))
 880                 dev = &ACPI_COMPANION(dev)->dev;
 881
 882         rcu_read_lock();
 883         for_each_iommu(iommu, drhd) {
 884                 if (pdev && segment != drhd->segment)
 885                         continue;
 886
 887                 for_each_active_dev_scope(drhd->devices,
 888                                           drhd->devices_cnt, i, tmp) {
 889                         if (tmp == dev) {
 890                                 /* For a VF use its original BDF# not that of the PF
 891                                  * which we used for the IOMMU lookup. Strictly speaking
 892                                  * we could do this for all PCI devices; we only need to
 893                                  * get the BDF# from the scope table for ACPI matches. */
 894                                 if (pdev && pdev->is_virtfn)
 895                                         goto got_pdev;
 896
 897                                 if (bus && devfn) {
 898                                         *bus = drhd->devices[i].bus;
 899                                         *devfn = drhd->devices[i].devfn;
 900                                 }
 901                                 goto out;
 902                         }
 903
 904                         if (is_downstream_to_pci_bridge(dev, tmp))
 905                                 goto got_pdev;
 906                 }
 907
 908                 if (pdev && drhd->include_all) {
 909                 got_pdev:
 910                         if (bus && devfn) {
 911                                 *bus = pdev->bus->number;
 912                                 *devfn = pdev->devfn;
 913                         }
 914                         goto out;
 915                 }
 916         }
 917         iommu = NULL;
 918  out:
 919         if (iommu_is_dummy(iommu, dev))
 920                 iommu = NULL;
 921
 922         rcu_read_unlock();
 923
 924         return iommu;
 925 }
 926
 927 static void domain_flush_cache(struct dmar_domain *domain,
 928                                void *addr, int size)
 929 {
 930         if (!domain->iommu_coherency)
 931                 clflush_cache_range(addr, size);
 932 }
 933
 934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 935 {
 936         struct context_entry *context;
 937         int ret = 0;
 938         unsigned long flags;
 939
 940         spin_lock_irqsave(&iommu->lock, flags);
 941         context = iommu_context_addr(iommu, bus, devfn, 0);
 942         if (context)
 943                 ret = context_present(context);
 944         spin_unlock_irqrestore(&iommu->lock, flags);
 945         return ret;
 946 }
 947
 948 static void free_context_table(struct intel_iommu *iommu)
 949 {
 950         int i;
 951         unsigned long flags;
 952         struct context_entry *context;
 953
 954         spin_lock_irqsave(&iommu->lock, flags);
 955         if (!iommu->root_entry) {
 956                 goto out;
 957         }
 958         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 959                 context = iommu_context_addr(iommu, i, 0, 0);
 960                 if (context)
 961                         free_pgtable_page(context);
 962
 963                 if (!sm_supported(iommu))
 964                         continue;
 965
 966                 context = iommu_context_addr(iommu, i, 0x80, 0);
 967                 if (context)
 968                         free_pgtable_page(context);
 969
 970         }
 971         free_pgtable_page(iommu->root_entry);
 972         iommu->root_entry = NULL;
 973 out:
 974         spin_unlock_irqrestore(&iommu->lock, flags);
 975 }
 976
 977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 978                                       unsigned long pfn, int *target_level)
 979 {
 980         struct dma_pte *parent, *pte;
 981         int level = agaw_to_level(domain->agaw);
 982         int offset;
 983
 984         BUG_ON(!domain->pgd);
 985
 986         if (!domain_pfn_supported(domain, pfn))
 987                 /* Address beyond IOMMU's addressing capabilities. */
 988                 return NULL;
 989
 990         parent = domain->pgd;
 991
 992         while (1) {
 993                 void *tmp_page;
 994
 995                 offset = pfn_level_offset(pfn, level);
 996                 pte = &parent[offset];
 997                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 998                         break;
 999                 if (level == *target_level)
1000                         break;
1001
1002                 if (!dma_pte_present(pte)) {
1003                         uint64_t pteval;
1004
1005                         tmp_page = alloc_pgtable_page(domain->nid);
1006
1007                         if (!tmp_page)
1008                                 return NULL;
1009
1010                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012                         if (domain_use_first_level(domain))
1013                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1015                                 /* Someone else set it while we were thinking; use theirs. */
1016                                 free_pgtable_page(tmp_page);
1017                         else
1018                                 domain_flush_cache(domain, pte, sizeof(*pte));
1019                 }
1020                 if (level == 1)
1021                         break;
1022
1023                 parent = phys_to_virt(dma_pte_addr(pte));
1024                 level--;
1025         }
1026
1027         if (!*target_level)
1028                 *target_level = level;
1029
1030         return pte;
1031 }
1032
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035                                          unsigned long pfn,
1036                                          int level, int *large_page)
1037 {
1038         struct dma_pte *parent, *pte;
1039         int total = agaw_to_level(domain->agaw);
1040         int offset;
1041
1042         parent = domain->pgd;
1043         while (level <= total) {
1044                 offset = pfn_level_offset(pfn, total);
1045                 pte = &parent[offset];
1046                 if (level == total)
1047                         return pte;
1048
1049                 if (!dma_pte_present(pte)) {
1050                         *large_page = total;
1051                         break;
1052                 }
1053
1054                 if (dma_pte_superpage(pte)) {
1055                         *large_page = total;
1056                         return pte;
1057                 }
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 total--;
1061         }
1062         return NULL;
1063 }
1064
1065 /* clear last level pte, a tlb flush should be followed */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067                                 unsigned long start_pfn,
1068                                 unsigned long last_pfn)
1069 {
1070         unsigned int large_page;
1071         struct dma_pte *first_pte, *pte;
1072
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         /* we don't need lock here; nobody else touches the iova range */
1078         do {
1079                 large_page = 1;
1080                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081                 if (!pte) {
1082                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083                         continue;
1084                 }
1085                 do {
1086                         dma_clear_pte(pte);
1087                         start_pfn += lvl_to_nr_pages(large_page);
1088                         pte++;
1089                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090
1091                 domain_flush_cache(domain, first_pte,
1092                                    (void *)pte - (void *)first_pte);
1093
1094         } while (start_pfn && start_pfn <= last_pfn);
1095 }
1096
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098                                int retain_level, struct dma_pte *pte,
1099                                unsigned long pfn, unsigned long start_pfn,
1100                                unsigned long last_pfn)
1101 {
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107                 struct dma_pte *level_pte;
1108
1109                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110                         goto next;
1111
1112                 level_pfn = pfn & level_mask(level);
1113                 level_pte = phys_to_virt(dma_pte_addr(pte));
1114
1115                 if (level > 2) {
1116                         dma_pte_free_level(domain, level - 1, retain_level,
1117                                            level_pte, level_pfn, start_pfn,
1118                                            last_pfn);
1119                 }
1120
1121                 /*
1122                  * Free the page table if we're below the level we want to
1123                  * retain and the range covers the entire table.
1124                  */
1125                 if (level < retain_level && !(start_pfn > level_pfn ||
1126                       last_pfn < level_pfn + level_size(level) - 1)) {
1127                         dma_clear_pte(pte);
1128                         domain_flush_cache(domain, pte, sizeof(*pte));
1129                         free_pgtable_page(level_pte);
1130                 }
1131 next:
1132                 pfn += level_size(level);
1133         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141                                    unsigned long start_pfn,
1142                                    unsigned long last_pfn,
1143                                    int retain_level)
1144 {
1145         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147         BUG_ON(start_pfn > last_pfn);
1148
1149         dma_pte_clear_range(domain, start_pfn, last_pfn);
1150
1151         /* We don't need lock here; nobody else touches the iova range */
1152         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153                            domain->pgd, 0, start_pfn, last_pfn);
1154
1155         /* free pgd */
1156         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157                 free_pgtable_page(domain->pgd);
1158                 domain->pgd = NULL;
1159         }
1160 }
1161
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169                                             int level, struct dma_pte *pte,
1170                                             struct page *freelist)
1171 {
1172         struct page *pg;
1173
1174         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175         pg->freelist = freelist;
1176         freelist = pg;
1177
1178         if (level == 1)
1179                 return freelist;
1180
1181         pte = page_address(pg);
1182         do {
1183                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184                         freelist = dma_pte_list_pagetables(domain, level - 1,
1185                                                            pte, freelist);
1186                 pte++;
1187         } while (!first_pte_in_page(pte));
1188
1189         return freelist;
1190 }
1191
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193                                         struct dma_pte *pte, unsigned long pfn,
1194                                         unsigned long start_pfn,
1195                                         unsigned long last_pfn,
1196                                         struct page *freelist)
1197 {
1198         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199
1200         pfn = max(start_pfn, pfn);
1201         pte = &pte[pfn_level_offset(pfn, level)];
1202
1203         do {
1204                 unsigned long level_pfn;
1205
1206                 if (!dma_pte_present(pte))
1207                         goto next;
1208
1209                 level_pfn = pfn & level_mask(level);
1210
1211                 /* If range covers entire pagetable, free it */
1212                 if (start_pfn <= level_pfn &&
1213                     last_pfn >= level_pfn + level_size(level) - 1) {
1214                         /* These suborbinate page tables are going away entirely. Don't
1215                            bother to clear them; we're just going to *free* them. */
1216                         if (level > 1 && !dma_pte_superpage(pte))
1217                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218
1219                         dma_clear_pte(pte);
1220                         if (!first_pte)
1221                                 first_pte = pte;
1222                         last_pte = pte;
1223                 } else if (level > 1) {
1224                         /* Recurse down into a level that isn't *entirely* obsolete */
1225                         freelist = dma_pte_clear_level(domain, level - 1,
1226                                                        phys_to_virt(dma_pte_addr(pte)),
1227                                                        level_pfn, start_pfn, last_pfn,
1228                                                        freelist);
1229                 }
1230 next:
1231                 pfn += level_size(level);
1232         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233
1234         if (first_pte)
1235                 domain_flush_cache(domain, first_pte,
1236                                    (void *)++last_pte - (void *)first_pte);
1237
1238         return freelist;
1239 }
1240
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245                                  unsigned long start_pfn,
1246                                  unsigned long last_pfn)
1247 {
1248         struct page *freelist;
1249
1250         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1251         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1252         BUG_ON(start_pfn > last_pfn);
1253
1254         /* we don't need lock here; nobody else touches the iova range */
1255         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1256                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1257
1258         /* free pgd */
1259         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260                 struct page *pgd_page = virt_to_page(domain->pgd);
1261                 pgd_page->freelist = freelist;
1262                 freelist = pgd_page;
1263
1264                 domain->pgd = NULL;
1265         }
1266
1267         return freelist;
1268 }
1269
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272         struct page *pg;
1273
1274         while ((pg = freelist)) {
1275                 freelist = pg->freelist;
1276                 free_pgtable_page(page_address(pg));
1277         }
1278 }
1279
1280 static void iova_entry_free(unsigned long data)
1281 {
1282         struct page *freelist = (struct page *)data;
1283
1284         dma_free_pagelist(freelist);
1285 }
1286
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290         struct root_entry *root;
1291         unsigned long flags;
1292
1293         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294         if (!root) {
1295                 pr_err("Allocating root entry for %s failed\n",
1296                         iommu->name);
1297                 return -ENOMEM;
1298         }
1299
1300         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1301
1302         spin_lock_irqsave(&iommu->lock, flags);
1303         iommu->root_entry = root;
1304         spin_unlock_irqrestore(&iommu->lock, flags);
1305
1306         return 0;
1307 }
1308
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311         u64 addr;
1312         u32 sts;
1313         unsigned long flag;
1314
1315         addr = virt_to_phys(iommu->root_entry);
1316         if (sm_supported(iommu))
1317                 addr |= DMA_RTADDR_SMT;
1318
1319         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321
1322         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323
1324         /* Make sure hardware complete it */
1325         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326                       readl, (sts & DMA_GSTS_RTPS), sts);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333         u32 val;
1334         unsigned long flag;
1335
1336         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337                 return;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341
1342         /* Make sure hardware complete it */
1343         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344                       readl, (!(val & DMA_GSTS_WBFS)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* return value determine if we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351                                   u16 did, u16 source_id, u8 function_mask,
1352                                   u64 type)
1353 {
1354         u64 val = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_CCMD_GLOBAL_INVL:
1359                 val = DMA_CCMD_GLOBAL_INVL;
1360                 break;
1361         case DMA_CCMD_DOMAIN_INVL:
1362                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363                 break;
1364         case DMA_CCMD_DEVICE_INVL:
1365                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367                 break;
1368         default:
1369                 BUG();
1370         }
1371         val |= DMA_CCMD_ICC;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375
1376         /* Make sure hardware complete it */
1377         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379
1380         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382
1383 /* return value determine if we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385                                 u64 addr, unsigned int size_order, u64 type)
1386 {
1387         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388         u64 val = 0, val_iva = 0;
1389         unsigned long flag;
1390
1391         switch (type) {
1392         case DMA_TLB_GLOBAL_FLUSH:
1393                 /* global flush doesn't need set IVA_REG */
1394                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395                 break;
1396         case DMA_TLB_DSI_FLUSH:
1397                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398                 break;
1399         case DMA_TLB_PSI_FLUSH:
1400                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401                 /* IH bit is passed in as part of address */
1402                 val_iva = size_order | addr;
1403                 break;
1404         default:
1405                 BUG();
1406         }
1407         /* Note: set drain read/write */
1408 #if 0
1409         /*
1410          * This is probably to be super secure.. Looks like we can
1411          * ignore it without any impact.
1412          */
1413         if (cap_read_drain(iommu->cap))
1414                 val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416         if (cap_write_drain(iommu->cap))
1417                 val |= DMA_TLB_WRITE_DRAIN;
1418
1419         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420         /* Note: Only uses first TLB reg currently */
1421         if (val_iva)
1422                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424
1425         /* Make sure hardware complete it */
1426         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428
1429         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430
1431         /* check IOTLB invalidation granularity */
1432         if (DMA_TLB_IAIG(val) == 0)
1433                 pr_err("Flush IOTLB failed\n");
1434         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1436                         (unsigned long long)DMA_TLB_IIRG(type),
1437                         (unsigned long long)DMA_TLB_IAIG(val));
1438 }
1439
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1442                          u8 bus, u8 devfn)
1443 {
1444         struct device_domain_info *info;
1445
1446         assert_spin_locked(&device_domain_lock);
1447
1448         if (!iommu->qi)
1449                 return NULL;
1450
1451         list_for_each_entry(info, &domain->devices, link)
1452                 if (info->iommu == iommu && info->bus == bus &&
1453                     info->devfn == devfn) {
1454                         if (info->ats_supported && info->dev)
1455                                 return info;
1456                         break;
1457                 }
1458
1459         return NULL;
1460 }
1461
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464         struct device_domain_info *info;
1465         bool has_iotlb_device = false;
1466
1467         assert_spin_locked(&device_domain_lock);
1468
1469         list_for_each_entry(info, &domain->devices, link) {
1470                 struct pci_dev *pdev;
1471
1472                 if (!info->dev || !dev_is_pci(info->dev))
1473                         continue;
1474
1475                 pdev = to_pci_dev(info->dev);
1476                 if (pdev->ats_enabled) {
1477                         has_iotlb_device = true;
1478                         break;
1479                 }
1480         }
1481
1482         domain->has_iotlb_device = has_iotlb_device;
1483 }
1484
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487         struct pci_dev *pdev;
1488
1489         assert_spin_locked(&device_domain_lock);
1490
1491         if (!info || !dev_is_pci(info->dev))
1492                 return;
1493
1494         pdev = to_pci_dev(info->dev);
1495         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1496          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1497          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1498          * reserved, which should be set to 0.
1499          */
1500         if (!ecap_dit(info->iommu->ecap))
1501                 info->pfsid = 0;
1502         else {
1503                 struct pci_dev *pf_pdev;
1504
1505                 /* pdev will be returned if device is not a vf */
1506                 pf_pdev = pci_physfn(pdev);
1507                 info->pfsid = pci_dev_id(pf_pdev);
1508         }
1509
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511         /* The PCIe spec, in its wisdom, declares that the behaviour of
1512            the device if you enable PASID support after ATS support is
1513            undefined. So always enable PASID support on devices which
1514            have it, even if we can't yet know if we're ever going to
1515            use it. */
1516         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517                 info->pasid_enabled = 1;
1518
1519         if (info->pri_supported &&
1520             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522                 info->pri_enabled = 1;
1523 #endif
1524         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526                 info->ats_enabled = 1;
1527                 domain_update_iotlb(info->domain);
1528                 info->ats_qdep = pci_ats_queue_depth(pdev);
1529         }
1530 }
1531
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534         struct pci_dev *pdev;
1535
1536         assert_spin_locked(&device_domain_lock);
1537
1538         if (!dev_is_pci(info->dev))
1539                 return;
1540
1541         pdev = to_pci_dev(info->dev);
1542
1543         if (info->ats_enabled) {
1544                 pci_disable_ats(pdev);
1545                 info->ats_enabled = 0;
1546                 domain_update_iotlb(info->domain);
1547         }
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549         if (info->pri_enabled) {
1550                 pci_disable_pri(pdev);
1551                 info->pri_enabled = 0;
1552         }
1553         if (info->pasid_enabled) {
1554                 pci_disable_pasid(pdev);
1555                 info->pasid_enabled = 0;
1556         }
1557 #endif
1558 }
1559
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561                                   u64 addr, unsigned mask)
1562 {
1563         u16 sid, qdep;
1564         unsigned long flags;
1565         struct device_domain_info *info;
1566
1567         if (!domain->has_iotlb_device)
1568                 return;
1569
1570         spin_lock_irqsave(&device_domain_lock, flags);
1571         list_for_each_entry(info, &domain->devices, link) {
1572                 if (!info->ats_enabled)
1573                         continue;
1574
1575                 sid = info->bus << 8 | info->devfn;
1576                 qdep = info->ats_qdep;
1577                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578                                 qdep, addr, mask);
1579         }
1580         spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
1582
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584                                 struct dmar_domain *domain,
1585                                 u64 addr, unsigned long npages, bool ih)
1586 {
1587         u16 did = domain->iommu_did[iommu->seq_id];
1588
1589         if (domain->default_pasid)
1590                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1591                                 addr, npages, ih);
1592
1593         if (!list_empty(&domain->devices))
1594                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
1596
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598                                   struct dmar_domain *domain,
1599                                   unsigned long pfn, unsigned int pages,
1600                                   int ih, int map)
1601 {
1602         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604         u16 did = domain->iommu_did[iommu->seq_id];
1605
1606         BUG_ON(pages == 0);
1607
1608         if (ih)
1609                 ih = 1 << 6;
1610
1611         if (domain_use_first_level(domain)) {
1612                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613         } else {
1614                 /*
1615                  * Fallback to domain selective flush if no PSI support or
1616                  * the size is too big. PSI requires page size to be 2 ^ x,
1617                  * and the base address is naturally aligned to the size.
1618                  */
1619                 if (!cap_pgsel_inv(iommu->cap) ||
1620                     mask > cap_max_amask_val(iommu->cap))
1621                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622                                                         DMA_TLB_DSI_FLUSH);
1623                 else
1624                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625                                                         DMA_TLB_PSI_FLUSH);
1626         }
1627
1628         /*
1629          * In caching mode, changes of pages from non-present to present require
1630          * flush. However, device IOTLB doesn't need to be flushed in this case.
1631          */
1632         if (!cap_caching_mode(iommu->cap) || !map)
1633                 iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
1635
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638                                         struct dmar_domain *domain,
1639                                         unsigned long pfn, unsigned int pages)
1640 {
1641         /*
1642          * It's a non-present to present mapping. Only flush if caching mode
1643          * and second level.
1644          */
1645         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647         else
1648                 iommu_flush_write_buffer(iommu);
1649 }
1650
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653         struct dmar_domain *domain;
1654         int idx;
1655
1656         domain = container_of(iovad, struct dmar_domain, iovad);
1657
1658         for_each_domain_iommu(idx, domain) {
1659                 struct intel_iommu *iommu = g_iommus[idx];
1660                 u16 did = domain->iommu_did[iommu->seq_id];
1661
1662                 if (domain_use_first_level(domain))
1663                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664                 else
1665                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666                                                  DMA_TLB_DSI_FLUSH);
1667
1668                 if (!cap_caching_mode(iommu->cap))
1669                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670                                               0, MAX_AGAW_PFN_WIDTH);
1671         }
1672 }
1673
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676         u32 pmen;
1677         unsigned long flags;
1678
1679         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680                 return;
1681
1682         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684         pmen &= ~DMA_PMEN_EPM;
1685         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686
1687         /* wait for the protected region status bit to clear */
1688         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1690
1691         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696         u32 sts;
1697         unsigned long flags;
1698
1699         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700         iommu->gcmd |= DMA_GCMD_TE;
1701         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702
1703         /* Make sure hardware complete it */
1704         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705                       readl, (sts & DMA_GSTS_TES), sts);
1706
1707         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712         u32 sts;
1713         unsigned long flag;
1714
1715         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717                 return;
1718
1719         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720         iommu->gcmd &= ~DMA_GCMD_TE;
1721         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722
1723         /* Make sure hardware complete it */
1724         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725                       readl, (!(sts & DMA_GSTS_TES)), sts);
1726
1727         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732         u32 ndomains, nlongs;
1733         size_t size;
1734
1735         ndomains = cap_ndoms(iommu->cap);
1736         pr_debug("%s: Number of Domains supported <%d>\n",
1737                  iommu->name, ndomains);
1738         nlongs = BITS_TO_LONGS(ndomains);
1739
1740         spin_lock_init(&iommu->lock);
1741
1742         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743         if (!iommu->domain_ids) {
1744                 pr_err("%s: Allocating domain id array failed\n",
1745                        iommu->name);
1746                 return -ENOMEM;
1747         }
1748
1749         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750         iommu->domains = kzalloc(size, GFP_KERNEL);
1751
1752         if (iommu->domains) {
1753                 size = 256 * sizeof(struct dmar_domain *);
1754                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755         }
1756
1757         if (!iommu->domains || !iommu->domains[0]) {
1758                 pr_err("%s: Allocating domain array failed\n",
1759                        iommu->name);
1760                 kfree(iommu->domain_ids);
1761                 kfree(iommu->domains);
1762                 iommu->domain_ids = NULL;
1763                 iommu->domains    = NULL;
1764                 return -ENOMEM;
1765         }
1766
1767         /*
1768          * If Caching mode is set, then invalid translations are tagged
1769          * with domain-id 0, hence we need to pre-allocate it. We also
1770          * use domain-id 0 as a marker for non-allocated domain-id, so
1771          * make sure it is not used for a real domain.
1772          */
1773         set_bit(0, iommu->domain_ids);
1774
1775         /*
1776          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1777          * entry for first-level or pass-through translation modes should
1778          * be programmed with a domain id different from those used for
1779          * second-level or nested translation. We reserve a domain id for
1780          * this purpose.
1781          */
1782         if (sm_supported(iommu))
1783                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784
1785         return 0;
1786 }
1787
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790         struct device_domain_info *info, *tmp;
1791         unsigned long flags;
1792
1793         if (!iommu->domains || !iommu->domain_ids)
1794                 return;
1795
1796         spin_lock_irqsave(&device_domain_lock, flags);
1797         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798                 if (info->iommu != iommu)
1799                         continue;
1800
1801                 if (!info->dev || !info->domain)
1802                         continue;
1803
1804                 __dmar_remove_one_dev_info(info);
1805         }
1806         spin_unlock_irqrestore(&device_domain_lock, flags);
1807
1808         if (iommu->gcmd & DMA_GCMD_TE)
1809                 iommu_disable_translation(iommu);
1810 }
1811
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814         if ((iommu->domains) && (iommu->domain_ids)) {
1815                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816                 int i;
1817
1818                 for (i = 0; i < elems; i++)
1819                         kfree(iommu->domains[i]);
1820                 kfree(iommu->domains);
1821                 kfree(iommu->domain_ids);
1822                 iommu->domains = NULL;
1823                 iommu->domain_ids = NULL;
1824         }
1825
1826         g_iommus[iommu->seq_id] = NULL;
1827
1828         /* free context mapping */
1829         free_context_table(iommu);
1830
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832         if (pasid_supported(iommu)) {
1833                 if (ecap_prs(iommu->ecap))
1834                         intel_svm_finish_prq(iommu);
1835         }
1836         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1838
1839 #endif
1840 }
1841
1842 /*
1843  * Check and return whether first level is used by default for
1844  * DMA translation.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848         struct dmar_drhd_unit *drhd;
1849         struct intel_iommu *iommu;
1850         static int first_level_support = -1;
1851
1852         if (likely(first_level_support != -1))
1853                 return first_level_support;
1854
1855         first_level_support = 1;
1856
1857         rcu_read_lock();
1858         for_each_active_iommu(iommu, drhd) {
1859                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860                         first_level_support = 0;
1861                         break;
1862                 }
1863         }
1864         rcu_read_unlock();
1865
1866         return first_level_support;
1867 }
1868
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871         struct dmar_domain *domain;
1872
1873         domain = alloc_domain_mem();
1874         if (!domain)
1875                 return NULL;
1876
1877         memset(domain, 0, sizeof(*domain));
1878         domain->nid = NUMA_NO_NODE;
1879         domain->flags = flags;
1880         if (first_level_by_default())
1881                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882         domain->has_iotlb_device = false;
1883         INIT_LIST_HEAD(&domain->devices);
1884
1885         return domain;
1886 }
1887
1888 /* Must be called with iommu->lock */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890                                struct intel_iommu *iommu)
1891 {
1892         unsigned long ndomains;
1893         int num;
1894
1895         assert_spin_locked(&device_domain_lock);
1896         assert_spin_locked(&iommu->lock);
1897
1898         domain->iommu_refcnt[iommu->seq_id] += 1;
1899         domain->iommu_count += 1;
1900         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901                 ndomains = cap_ndoms(iommu->cap);
1902                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903
1904                 if (num >= ndomains) {
1905                         pr_err("%s: No free domain ids\n", iommu->name);
1906                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1907                         domain->iommu_count -= 1;
1908                         return -ENOSPC;
1909                 }
1910
1911                 set_bit(num, iommu->domain_ids);
1912                 set_iommu_domain(iommu, num, domain);
1913
1914                 domain->iommu_did[iommu->seq_id] = num;
1915                 domain->nid                      = iommu->node;
1916
1917                 domain_update_iommu_cap(domain);
1918         }
1919
1920         return 0;
1921 }
1922
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924                                struct intel_iommu *iommu)
1925 {
1926         int num, count;
1927
1928         assert_spin_locked(&device_domain_lock);
1929         assert_spin_locked(&iommu->lock);
1930
1931         domain->iommu_refcnt[iommu->seq_id] -= 1;
1932         count = --domain->iommu_count;
1933         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934                 num = domain->iommu_did[iommu->seq_id];
1935                 clear_bit(num, iommu->domain_ids);
1936                 set_iommu_domain(iommu, num, NULL);
1937
1938                 domain_update_iommu_cap(domain);
1939                 domain->iommu_did[iommu->seq_id] = 0;
1940         }
1941
1942         return count;
1943 }
1944
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950         struct pci_dev *pdev = NULL;
1951         struct iova *iova;
1952         int i;
1953
1954         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955
1956         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957                 &reserved_rbtree_key);
1958
1959         /* IOAPIC ranges shouldn't be accessed by DMA */
1960         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961                 IOVA_PFN(IOAPIC_RANGE_END));
1962         if (!iova) {
1963                 pr_err("Reserve IOAPIC range failed\n");
1964                 return -ENODEV;
1965         }
1966
1967         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1968         for_each_pci_dev(pdev) {
1969                 struct resource *r;
1970
1971                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972                         r = &pdev->resource[i];
1973                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974                                 continue;
1975                         iova = reserve_iova(&reserved_iova_list,
1976                                             IOVA_PFN(r->start),
1977                                             IOVA_PFN(r->end));
1978                         if (!iova) {
1979                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980                                 return -ENODEV;
1981                         }
1982                 }
1983         }
1984         return 0;
1985 }
1986
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989         int agaw;
1990         int r = (gaw - 12) % 9;
1991
1992         if (r == 0)
1993                 agaw = gaw;
1994         else
1995                 agaw = gaw + 9 - r;
1996         if (agaw > 64)
1997                 agaw = 64;
1998         return agaw;
1999 }
2000
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003
2004         /* Remove associated devices and clear attached or cached domains */
2005         domain_remove_dev_info(domain);
2006
2007         /* destroy iovas */
2008         if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009                 put_iova_domain(&domain->iovad);
2010
2011         if (domain->pgd) {
2012                 struct page *freelist;
2013
2014                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015                 dma_free_pagelist(freelist);
2016         }
2017
2018         free_domain_mem(domain);
2019 }
2020
2021 /*
2022  * Get the PASID directory size for scalable mode context entry.
2023  * Value of X in the PDTS field of a scalable mode context entry
2024  * indicates PASID directory with 2^(X + 7) entries.
2025  */
2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2027 {
2028         int pds, max_pde;
2029
2030         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2031         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2032         if (pds < 7)
2033                 return 0;
2034
2035         return pds - 7;
2036 }
2037
2038 /*
2039  * Set the RID_PASID field of a scalable mode context entry. The
2040  * IOMMU hardware will use the PASID value set in this field for
2041  * DMA translations of DMA requests without PASID.
2042  */
2043 static inline void
2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2045 {
2046         context->hi |= pasid & ((1 << 20) - 1);
2047 }
2048
2049 /*
2050  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_dte(struct context_entry *context)
2054 {
2055         context->lo |= (1 << 2);
2056 }
2057
2058 /*
2059  * Set the PRE(Page Request Enable) field of a scalable mode context
2060  * entry.
2061  */
2062 static inline void context_set_sm_pre(struct context_entry *context)
2063 {
2064         context->lo |= (1 << 4);
2065 }
2066
2067 /* Convert value to context PASID directory size field coding. */
2068 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2069
2070 static int domain_context_mapping_one(struct dmar_domain *domain,
2071                                       struct intel_iommu *iommu,
2072                                       struct pasid_table *table,
2073                                       u8 bus, u8 devfn)
2074 {
2075         u16 did = domain->iommu_did[iommu->seq_id];
2076         int translation = CONTEXT_TT_MULTI_LEVEL;
2077         struct device_domain_info *info = NULL;
2078         struct context_entry *context;
2079         unsigned long flags;
2080         int ret;
2081
2082         WARN_ON(did == 0);
2083
2084         if (hw_pass_through && domain_type_is_si(domain))
2085                 translation = CONTEXT_TT_PASS_THROUGH;
2086
2087         pr_debug("Set context mapping for %02x:%02x.%d\n",
2088                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2089
2090         BUG_ON(!domain->pgd);
2091
2092         spin_lock_irqsave(&device_domain_lock, flags);
2093         spin_lock(&iommu->lock);
2094
2095         ret = -ENOMEM;
2096         context = iommu_context_addr(iommu, bus, devfn, 1);
2097         if (!context)
2098                 goto out_unlock;
2099
2100         ret = 0;
2101         if (context_present(context))
2102                 goto out_unlock;
2103
2104         /*
2105          * For kdump cases, old valid entries may be cached due to the
2106          * in-flight DMA and copied pgtable, but there is no unmapping
2107          * behaviour for them, thus we need an explicit cache flush for
2108          * the newly-mapped device. For kdump, at this point, the device
2109          * is supposed to finish reset at its driver probe stage, so no
2110          * in-flight DMA will exist, and we don't need to worry anymore
2111          * hereafter.
2112          */
2113         if (context_copied(context)) {
2114                 u16 did_old = context_domain_id(context);
2115
2116                 if (did_old < cap_ndoms(iommu->cap)) {
2117                         iommu->flush.flush_context(iommu, did_old,
2118                                                    (((u16)bus) << 8) | devfn,
2119                                                    DMA_CCMD_MASK_NOBIT,
2120                                                    DMA_CCMD_DEVICE_INVL);
2121                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2122                                                  DMA_TLB_DSI_FLUSH);
2123                 }
2124         }
2125
2126         context_clear_entry(context);
2127
2128         if (sm_supported(iommu)) {
2129                 unsigned long pds;
2130
2131                 WARN_ON(!table);
2132
2133                 /* Setup the PASID DIR pointer: */
2134                 pds = context_get_sm_pds(table);
2135                 context->lo = (u64)virt_to_phys(table->table) |
2136                                 context_pdts(pds);
2137
2138                 /* Setup the RID_PASID field: */
2139                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2140
2141                 /*
2142                  * Setup the Device-TLB enable bit and Page request
2143                  * Enable bit:
2144                  */
2145                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146                 if (info && info->ats_supported)
2147                         context_set_sm_dte(context);
2148                 if (info && info->pri_supported)
2149                         context_set_sm_pre(context);
2150         } else {
2151                 struct dma_pte *pgd = domain->pgd;
2152                 int agaw;
2153
2154                 context_set_domain_id(context, did);
2155
2156                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2157                         /*
2158                          * Skip top levels of page tables for iommu which has
2159                          * less agaw than default. Unnecessary for PT mode.
2160                          */
2161                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2162                                 ret = -ENOMEM;
2163                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2164                                 if (!dma_pte_present(pgd))
2165                                         goto out_unlock;
2166                         }
2167
2168                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2169                         if (info && info->ats_supported)
2170                                 translation = CONTEXT_TT_DEV_IOTLB;
2171                         else
2172                                 translation = CONTEXT_TT_MULTI_LEVEL;
2173
2174                         context_set_address_root(context, virt_to_phys(pgd));
2175                         context_set_address_width(context, agaw);
2176                 } else {
2177                         /*
2178                          * In pass through mode, AW must be programmed to
2179                          * indicate the largest AGAW value supported by
2180                          * hardware. And ASR is ignored by hardware.
2181                          */
2182                         context_set_address_width(context, iommu->msagaw);
2183                 }
2184
2185                 context_set_translation_type(context, translation);
2186         }
2187
2188         context_set_fault_enable(context);
2189         context_set_present(context);
2190         if (!ecap_coherent(iommu->ecap))
2191                 clflush_cache_range(context, sizeof(*context));
2192
2193         /*
2194          * It's a non-present to present mapping. If hardware doesn't cache
2195          * non-present entry we only need to flush the write-buffer. If the
2196          * _does_ cache non-present entries, then it does so in the special
2197          * domain #0, which we have to flush:
2198          */
2199         if (cap_caching_mode(iommu->cap)) {
2200                 iommu->flush.flush_context(iommu, 0,
2201                                            (((u16)bus) << 8) | devfn,
2202                                            DMA_CCMD_MASK_NOBIT,
2203                                            DMA_CCMD_DEVICE_INVL);
2204                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2205         } else {
2206                 iommu_flush_write_buffer(iommu);
2207         }
2208         iommu_enable_dev_iotlb(info);
2209
2210         ret = 0;
2211
2212 out_unlock:
2213         spin_unlock(&iommu->lock);
2214         spin_unlock_irqrestore(&device_domain_lock, flags);
2215
2216         return ret;
2217 }
2218
2219 struct domain_context_mapping_data {
2220         struct dmar_domain *domain;
2221         struct intel_iommu *iommu;
2222         struct pasid_table *table;
2223 };
2224
2225 static int domain_context_mapping_cb(struct pci_dev *pdev,
2226                                      u16 alias, void *opaque)
2227 {
2228         struct domain_context_mapping_data *data = opaque;
2229
2230         return domain_context_mapping_one(data->domain, data->iommu,
2231                                           data->table, PCI_BUS_NUM(alias),
2232                                           alias & 0xff);
2233 }
2234
2235 static int
2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2237 {
2238         struct domain_context_mapping_data data;
2239         struct pasid_table *table;
2240         struct intel_iommu *iommu;
2241         u8 bus, devfn;
2242
2243         iommu = device_to_iommu(dev, &bus, &devfn);
2244         if (!iommu)
2245                 return -ENODEV;
2246
2247         table = intel_pasid_get_table(dev);
2248
2249         if (!dev_is_pci(dev))
2250                 return domain_context_mapping_one(domain, iommu, table,
2251                                                   bus, devfn);
2252
2253         data.domain = domain;
2254         data.iommu = iommu;
2255         data.table = table;
2256
2257         return pci_for_each_dma_alias(to_pci_dev(dev),
2258                                       &domain_context_mapping_cb, &data);
2259 }
2260
2261 static int domain_context_mapped_cb(struct pci_dev *pdev,
2262                                     u16 alias, void *opaque)
2263 {
2264         struct intel_iommu *iommu = opaque;
2265
2266         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2267 }
2268
2269 static int domain_context_mapped(struct device *dev)
2270 {
2271         struct intel_iommu *iommu;
2272         u8 bus, devfn;
2273
2274         iommu = device_to_iommu(dev, &bus, &devfn);
2275         if (!iommu)
2276                 return -ENODEV;
2277
2278         if (!dev_is_pci(dev))
2279                 return device_context_mapped(iommu, bus, devfn);
2280
2281         return !pci_for_each_dma_alias(to_pci_dev(dev),
2282                                        domain_context_mapped_cb, iommu);
2283 }
2284
2285 /* Returns a number of VTD pages, but aligned to MM page size */
2286 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2287                                             size_t size)
2288 {
2289         host_addr &= ~PAGE_MASK;
2290         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2291 }
2292
2293 /* Return largest possible superpage level for a given mapping */
2294 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2295                                           unsigned long iov_pfn,
2296                                           unsigned long phy_pfn,
2297                                           unsigned long pages)
2298 {
2299         int support, level = 1;
2300         unsigned long pfnmerge;
2301
2302         support = domain->iommu_superpage;
2303
2304         /* To use a large page, the virtual *and* physical addresses
2305            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2306            of them will mean we have to use smaller pages. So just
2307            merge them and check both at once. */
2308         pfnmerge = iov_pfn | phy_pfn;
2309
2310         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2311                 pages >>= VTD_STRIDE_SHIFT;
2312                 if (!pages)
2313                         break;
2314                 pfnmerge >>= VTD_STRIDE_SHIFT;
2315                 level++;
2316                 support--;
2317         }
2318         return level;
2319 }
2320
2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322                             struct scatterlist *sg, unsigned long phys_pfn,
2323                             unsigned long nr_pages, int prot)
2324 {
2325         struct dma_pte *first_pte = NULL, *pte = NULL;
2326         phys_addr_t pteval;
2327         unsigned long sg_res = 0;
2328         unsigned int largepage_lvl = 0;
2329         unsigned long lvl_pages = 0;
2330         u64 attr;
2331
2332         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2333
2334         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2335                 return -EINVAL;
2336
2337         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2338         if (domain_use_first_level(domain))
2339                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2340
2341         if (!sg) {
2342                 sg_res = nr_pages;
2343                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344         }
2345
2346         while (nr_pages > 0) {
2347                 uint64_t tmp;
2348
2349                 if (!sg_res) {
2350                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2351
2352                         sg_res = aligned_nrpages(sg->offset, sg->length);
2353                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2354                         sg->dma_length = sg->length;
2355                         pteval = (sg_phys(sg) - pgoff) | attr;
2356                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2357                 }
2358
2359                 if (!pte) {
2360                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2361
2362                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2363                         if (!pte)
2364                                 return -ENOMEM;
2365                         /* It is large page*/
2366                         if (largepage_lvl > 1) {
2367                                 unsigned long nr_superpages, end_pfn;
2368
2369                                 pteval |= DMA_PTE_LARGE_PAGE;
2370                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2371
2372                                 nr_superpages = sg_res / lvl_pages;
2373                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2374
2375                                 /*
2376                                  * Ensure that old small page tables are
2377                                  * removed to make room for superpage(s).
2378                                  * We're adding new large pages, so make sure
2379                                  * we don't remove their parent tables.
2380                                  */
2381                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2382                                                        largepage_lvl + 1);
2383                         } else {
2384                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2385                         }
2386
2387                 }
2388                 /* We don't need lock here, nobody else
2389                  * touches the iova range
2390                  */
2391                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2392                 if (tmp) {
2393                         static int dumps = 5;
2394                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2395                                 iov_pfn, tmp, (unsigned long long)pteval);
2396                         if (dumps) {
2397                                 dumps--;
2398                                 debug_dma_dump_mappings(NULL);
2399                         }
2400                         WARN_ON(1);
2401                 }
2402
2403                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2404
2405                 BUG_ON(nr_pages < lvl_pages);
2406                 BUG_ON(sg_res < lvl_pages);
2407
2408                 nr_pages -= lvl_pages;
2409                 iov_pfn += lvl_pages;
2410                 phys_pfn += lvl_pages;
2411                 pteval += lvl_pages * VTD_PAGE_SIZE;
2412                 sg_res -= lvl_pages;
2413
2414                 /* If the next PTE would be the first in a new page, then we
2415                    need to flush the cache on the entries we've just written.
2416                    And then we'll need to recalculate 'pte', so clear it and
2417                    let it get set again in the if (!pte) block above.
2418
2419                    If we're done (!nr_pages) we need to flush the cache too.
2420
2421                    Also if we've been setting superpages, we may need to
2422                    recalculate 'pte' and switch back to smaller pages for the
2423                    end of the mapping, if the trailing size is not enough to
2424                    use another superpage (i.e. sg_res < lvl_pages). */
2425                 pte++;
2426                 if (!nr_pages || first_pte_in_page(pte) ||
2427                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2428                         domain_flush_cache(domain, first_pte,
2429                                            (void *)pte - (void *)first_pte);
2430                         pte = NULL;
2431                 }
2432
2433                 if (!sg_res && nr_pages)
2434                         sg = sg_next(sg);
2435         }
2436         return 0;
2437 }
2438
2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2440                           struct scatterlist *sg, unsigned long phys_pfn,
2441                           unsigned long nr_pages, int prot)
2442 {
2443         int iommu_id, ret;
2444         struct intel_iommu *iommu;
2445
2446         /* Do the real mapping first */
2447         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2448         if (ret)
2449                 return ret;
2450
2451         for_each_domain_iommu(iommu_id, domain) {
2452                 iommu = g_iommus[iommu_id];
2453                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2454         }
2455
2456         return 0;
2457 }
2458
2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2460                                     struct scatterlist *sg, unsigned long nr_pages,
2461                                     int prot)
2462 {
2463         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2464 }
2465
2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2467                                      unsigned long phys_pfn, unsigned long nr_pages,
2468                                      int prot)
2469 {
2470         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2471 }
2472
2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2474 {
2475         unsigned long flags;
2476         struct context_entry *context;
2477         u16 did_old;
2478
2479         if (!iommu)
2480                 return;
2481
2482         spin_lock_irqsave(&iommu->lock, flags);
2483         context = iommu_context_addr(iommu, bus, devfn, 0);
2484         if (!context) {
2485                 spin_unlock_irqrestore(&iommu->lock, flags);
2486                 return;
2487         }
2488         did_old = context_domain_id(context);
2489         context_clear_entry(context);
2490         __iommu_flush_cache(iommu, context, sizeof(*context));
2491         spin_unlock_irqrestore(&iommu->lock, flags);
2492         iommu->flush.flush_context(iommu,
2493                                    did_old,
2494                                    (((u16)bus) << 8) | devfn,
2495                                    DMA_CCMD_MASK_NOBIT,
2496                                    DMA_CCMD_DEVICE_INVL);
2497         iommu->flush.flush_iotlb(iommu,
2498                                  did_old,
2499                                  0,
2500                                  0,
2501                                  DMA_TLB_DSI_FLUSH);
2502 }
2503
2504 static inline void unlink_domain_info(struct device_domain_info *info)
2505 {
2506         assert_spin_locked(&device_domain_lock);
2507         list_del(&info->link);
2508         list_del(&info->global);
2509         if (info->dev)
2510                 dev_iommu_priv_set(info->dev, NULL);
2511 }
2512
2513 static void domain_remove_dev_info(struct dmar_domain *domain)
2514 {
2515         struct device_domain_info *info, *tmp;
2516         unsigned long flags;
2517
2518         spin_lock_irqsave(&device_domain_lock, flags);
2519         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2520                 __dmar_remove_one_dev_info(info);
2521         spin_unlock_irqrestore(&device_domain_lock, flags);
2522 }
2523
2524 struct dmar_domain *find_domain(struct device *dev)
2525 {
2526         struct device_domain_info *info;
2527
2528         if (unlikely(!dev || !dev->iommu))
2529                 return NULL;
2530
2531         if (unlikely(attach_deferred(dev)))
2532                 return NULL;
2533
2534         /* No lock here, assumes no domain exit in normal case */
2535         info = get_domain_info(dev);
2536         if (likely(info))
2537                 return info->domain;
2538
2539         return NULL;
2540 }
2541
2542 static void do_deferred_attach(struct device *dev)
2543 {
2544         struct iommu_domain *domain;
2545
2546         dev_iommu_priv_set(dev, NULL);
2547         domain = iommu_get_domain_for_dev(dev);
2548         if (domain)
2549                 intel_iommu_attach_device(domain, dev);
2550 }
2551
2552 static inline struct device_domain_info *
2553 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2554 {
2555         struct device_domain_info *info;
2556
2557         list_for_each_entry(info, &device_domain_list, global)
2558                 if (info->segment == segment && info->bus == bus &&
2559                     info->devfn == devfn)
2560                         return info;
2561
2562         return NULL;
2563 }
2564
2565 static int domain_setup_first_level(struct intel_iommu *iommu,
2566                                     struct dmar_domain *domain,
2567                                     struct device *dev,
2568                                     u32 pasid)
2569 {
2570         int flags = PASID_FLAG_SUPERVISOR_MODE;
2571         struct dma_pte *pgd = domain->pgd;
2572         int agaw, level;
2573
2574         /*
2575          * Skip top levels of page tables for iommu which has
2576          * less agaw than default. Unnecessary for PT mode.
2577          */
2578         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2579                 pgd = phys_to_virt(dma_pte_addr(pgd));
2580                 if (!dma_pte_present(pgd))
2581                         return -ENOMEM;
2582         }
2583
2584         level = agaw_to_level(agaw);
2585         if (level != 4 && level != 5)
2586                 return -EINVAL;
2587
2588         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2589
2590         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2591                                              domain->iommu_did[iommu->seq_id],
2592                                              flags);
2593 }
2594
2595 static bool dev_is_real_dma_subdevice(struct device *dev)
2596 {
2597         return dev && dev_is_pci(dev) &&
2598                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2599 }
2600
2601 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2602                                                     int bus, int devfn,
2603                                                     struct device *dev,
2604                                                     struct dmar_domain *domain)
2605 {
2606         struct dmar_domain *found = NULL;
2607         struct device_domain_info *info;
2608         unsigned long flags;
2609         int ret;
2610
2611         info = alloc_devinfo_mem();
2612         if (!info)
2613                 return NULL;
2614
2615         if (!dev_is_real_dma_subdevice(dev)) {
2616                 info->bus = bus;
2617                 info->devfn = devfn;
2618                 info->segment = iommu->segment;
2619         } else {
2620                 struct pci_dev *pdev = to_pci_dev(dev);
2621
2622                 info->bus = pdev->bus->number;
2623                 info->devfn = pdev->devfn;
2624                 info->segment = pci_domain_nr(pdev->bus);
2625         }
2626
2627         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2628         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2629         info->ats_qdep = 0;
2630         info->dev = dev;
2631         info->domain = domain;
2632         info->iommu = iommu;
2633         info->pasid_table = NULL;
2634         info->auxd_enabled = 0;
2635         INIT_LIST_HEAD(&info->auxiliary_domains);
2636
2637         if (dev && dev_is_pci(dev)) {
2638                 struct pci_dev *pdev = to_pci_dev(info->dev);
2639
2640                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2641                     pci_ats_supported(pdev) &&
2642                     dmar_find_matched_atsr_unit(pdev))
2643                         info->ats_supported = 1;
2644
2645                 if (sm_supported(iommu)) {
2646                         if (pasid_supported(iommu)) {
2647                                 int features = pci_pasid_features(pdev);
2648                                 if (features >= 0)
2649                                         info->pasid_supported = features | 1;
2650                         }
2651
2652                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2653                             pci_pri_supported(pdev))
2654                                 info->pri_supported = 1;
2655                 }
2656         }
2657
2658         spin_lock_irqsave(&device_domain_lock, flags);
2659         if (dev)
2660                 found = find_domain(dev);
2661
2662         if (!found) {
2663                 struct device_domain_info *info2;
2664                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2665                                                        info->devfn);
2666                 if (info2) {
2667                         found      = info2->domain;
2668                         info2->dev = dev;
2669                 }
2670         }
2671
2672         if (found) {
2673                 spin_unlock_irqrestore(&device_domain_lock, flags);
2674                 free_devinfo_mem(info);
2675                 /* Caller must free the original domain */
2676                 return found;
2677         }
2678
2679         spin_lock(&iommu->lock);
2680         ret = domain_attach_iommu(domain, iommu);
2681         spin_unlock(&iommu->lock);
2682
2683         if (ret) {
2684                 spin_unlock_irqrestore(&device_domain_lock, flags);
2685                 free_devinfo_mem(info);
2686                 return NULL;
2687         }
2688
2689         list_add(&info->link, &domain->devices);
2690         list_add(&info->global, &device_domain_list);
2691         if (dev)
2692                 dev_iommu_priv_set(dev, info);
2693         spin_unlock_irqrestore(&device_domain_lock, flags);
2694
2695         /* PASID table is mandatory for a PCI device in scalable mode. */
2696         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2697                 ret = intel_pasid_alloc_table(dev);
2698                 if (ret) {
2699                         dev_err(dev, "PASID table allocation failed\n");
2700                         dmar_remove_one_dev_info(dev);
2701                         return NULL;
2702                 }
2703
2704                 /* Setup the PASID entry for requests without PASID: */
2705                 spin_lock_irqsave(&iommu->lock, flags);
2706                 if (hw_pass_through && domain_type_is_si(domain))
2707                         ret = intel_pasid_setup_pass_through(iommu, domain,
2708                                         dev, PASID_RID2PASID);
2709                 else if (domain_use_first_level(domain))
2710                         ret = domain_setup_first_level(iommu, domain, dev,
2711                                         PASID_RID2PASID);
2712                 else
2713                         ret = intel_pasid_setup_second_level(iommu, domain,
2714                                         dev, PASID_RID2PASID);
2715                 spin_unlock_irqrestore(&iommu->lock, flags);
2716                 if (ret) {
2717                         dev_err(dev, "Setup RID2PASID failed\n");
2718                         dmar_remove_one_dev_info(dev);
2719                         return NULL;
2720                 }
2721         }
2722
2723         if (dev && domain_context_mapping(domain, dev)) {
2724                 dev_err(dev, "Domain context map failed\n");
2725                 dmar_remove_one_dev_info(dev);
2726                 return NULL;
2727         }
2728
2729         return domain;
2730 }
2731
2732 static int iommu_domain_identity_map(struct dmar_domain *domain,
2733                                      unsigned long first_vpfn,
2734                                      unsigned long last_vpfn)
2735 {
2736         /*
2737          * RMRR range might have overlap with physical memory range,
2738          * clear it first
2739          */
2740         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2741
2742         return __domain_mapping(domain, first_vpfn, NULL,
2743                                 first_vpfn, last_vpfn - first_vpfn + 1,
2744                                 DMA_PTE_READ|DMA_PTE_WRITE);
2745 }
2746
2747 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2748
2749 static int __init si_domain_init(int hw)
2750 {
2751         struct dmar_rmrr_unit *rmrr;
2752         struct device *dev;
2753         int i, nid, ret;
2754
2755         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2756         if (!si_domain)
2757                 return -EFAULT;
2758
2759         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2760                 domain_exit(si_domain);
2761                 return -EFAULT;
2762         }
2763
2764         if (hw)
2765                 return 0;
2766
2767         for_each_online_node(nid) {
2768                 unsigned long start_pfn, end_pfn;
2769                 int i;
2770
2771                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2772                         ret = iommu_domain_identity_map(si_domain,
2773                                         mm_to_dma_pfn(start_pfn),
2774                                         mm_to_dma_pfn(end_pfn));
2775                         if (ret)
2776                                 return ret;
2777                 }
2778         }
2779
2780         /*
2781          * Identity map the RMRRs so that devices with RMRRs could also use
2782          * the si_domain.
2783          */
2784         for_each_rmrr_units(rmrr) {
2785                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2786                                           i, dev) {
2787                         unsigned long long start = rmrr->base_address;
2788                         unsigned long long end = rmrr->end_address;
2789
2790                         if (WARN_ON(end < start ||
2791                                     end >> agaw_to_width(si_domain->agaw)))
2792                                 continue;
2793
2794                         ret = iommu_domain_identity_map(si_domain,
2795                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2796                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2797                         if (ret)
2798                                 return ret;
2799                 }
2800         }
2801
2802         return 0;
2803 }
2804
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2806 {
2807         struct dmar_domain *ndomain;
2808         struct intel_iommu *iommu;
2809         u8 bus, devfn;
2810
2811         iommu = device_to_iommu(dev, &bus, &devfn);
2812         if (!iommu)
2813                 return -ENODEV;
2814
2815         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816         if (ndomain != domain)
2817                 return -EBUSY;
2818
2819         return 0;
2820 }
2821
2822 static bool device_has_rmrr(struct device *dev)
2823 {
2824         struct dmar_rmrr_unit *rmrr;
2825         struct device *tmp;
2826         int i;
2827
2828         rcu_read_lock();
2829         for_each_rmrr_units(rmrr) {
2830                 /*
2831                  * Return TRUE if this RMRR contains the device that
2832                  * is passed in.
2833                  */
2834                 for_each_active_dev_scope(rmrr->devices,
2835                                           rmrr->devices_cnt, i, tmp)
2836                         if (tmp == dev ||
2837                             is_downstream_to_pci_bridge(dev, tmp)) {
2838                                 rcu_read_unlock();
2839                                 return true;
2840                         }
2841         }
2842         rcu_read_unlock();
2843         return false;
2844 }
2845
2846 /**
2847  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848  * is relaxable (ie. is allowed to be not enforced under some conditions)
2849  * @dev: device handle
2850  *
2851  * We assume that PCI USB devices with RMRRs have them largely
2852  * for historical reasons and that the RMRR space is not actively used post
2853  * boot.  This exclusion may change if vendors begin to abuse it.
2854  *
2855  * The same exception is made for graphics devices, with the requirement that
2856  * any use of the RMRR regions will be torn down before assigning the device
2857  * to a guest.
2858  *
2859  * Return: true if the RMRR is relaxable, false otherwise
2860  */
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2862 {
2863         struct pci_dev *pdev;
2864
2865         if (!dev_is_pci(dev))
2866                 return false;
2867
2868         pdev = to_pci_dev(dev);
2869         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870                 return true;
2871         else
2872                 return false;
2873 }
2874
2875 /*
2876  * There are a couple cases where we need to restrict the functionality of
2877  * devices associated with RMRRs.  The first is when evaluating a device for
2878  * identity mapping because problems exist when devices are moved in and out
2879  * of domains and their respective RMRR information is lost.  This means that
2880  * a device with associated RMRRs will never be in a "passthrough" domain.
2881  * The second is use of the device through the IOMMU API.  This interface
2882  * expects to have full control of the IOVA space for the device.  We cannot
2883  * satisfy both the requirement that RMRR access is maintained and have an
2884  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2885  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886  * We therefore prevent devices associated with an RMRR from participating in
2887  * the IOMMU API, which eliminates them from device assignment.
2888  *
2889  * In both cases, devices which have relaxable RMRRs are not concerned by this
2890  * restriction. See device_rmrr_is_relaxable comment.
2891  */
2892 static bool device_is_rmrr_locked(struct device *dev)
2893 {
2894         if (!device_has_rmrr(dev))
2895                 return false;
2896
2897         if (device_rmrr_is_relaxable(dev))
2898                 return false;
2899
2900         return true;
2901 }
2902
2903 /*
2904  * Return the required default domain type for a specific device.
2905  *
2906  * @dev: the device in query
2907  * @startup: true if this is during early boot
2908  *
2909  * Returns:
2910  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2912  *  - 0: both identity and dynamic domains work for this device
2913  */
2914 static int device_def_domain_type(struct device *dev)
2915 {
2916         if (dev_is_pci(dev)) {
2917                 struct pci_dev *pdev = to_pci_dev(dev);
2918
2919                 /*
2920                  * Prevent any device marked as untrusted from getting
2921                  * placed into the statically identity mapping domain.
2922                  */
2923                 if (pdev->untrusted)
2924                         return IOMMU_DOMAIN_DMA;
2925
2926                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2927                         return IOMMU_DOMAIN_IDENTITY;
2928
2929                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2930                         return IOMMU_DOMAIN_IDENTITY;
2931         }
2932
2933         return 0;
2934 }
2935
2936 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2937 {
2938         /*
2939          * Start from the sane iommu hardware state.
2940          * If the queued invalidation is already initialized by us
2941          * (for example, while enabling interrupt-remapping) then
2942          * we got the things already rolling from a sane state.
2943          */
2944         if (!iommu->qi) {
2945                 /*
2946                  * Clear any previous faults.
2947                  */
2948                 dmar_fault(-1, iommu);
2949                 /*
2950                  * Disable queued invalidation if supported and already enabled
2951                  * before OS handover.
2952                  */
2953                 dmar_disable_qi(iommu);
2954         }
2955
2956         if (dmar_enable_qi(iommu)) {
2957                 /*
2958                  * Queued Invalidate not enabled, use Register Based Invalidate
2959                  */
2960                 iommu->flush.flush_context = __iommu_flush_context;
2961                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2962                 pr_info("%s: Using Register based invalidation\n",
2963                         iommu->name);
2964         } else {
2965                 iommu->flush.flush_context = qi_flush_context;
2966                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2967                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2968         }
2969 }
2970
2971 static int copy_context_table(struct intel_iommu *iommu,
2972                               struct root_entry *old_re,
2973                               struct context_entry **tbl,
2974                               int bus, bool ext)
2975 {
2976         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2977         struct context_entry *new_ce = NULL, ce;
2978         struct context_entry *old_ce = NULL;
2979         struct root_entry re;
2980         phys_addr_t old_ce_phys;
2981
2982         tbl_idx = ext ? bus * 2 : bus;
2983         memcpy(&re, old_re, sizeof(re));
2984
2985         for (devfn = 0; devfn < 256; devfn++) {
2986                 /* First calculate the correct index */
2987                 idx = (ext ? devfn * 2 : devfn) % 256;
2988
2989                 if (idx == 0) {
2990                         /* First save what we may have and clean up */
2991                         if (new_ce) {
2992                                 tbl[tbl_idx] = new_ce;
2993                                 __iommu_flush_cache(iommu, new_ce,
2994                                                     VTD_PAGE_SIZE);
2995                                 pos = 1;
2996                         }
2997
2998                         if (old_ce)
2999                                 memunmap(old_ce);
3000
3001                         ret = 0;
3002                         if (devfn < 0x80)
3003                                 old_ce_phys = root_entry_lctp(&re);
3004                         else
3005                                 old_ce_phys = root_entry_uctp(&re);
3006
3007                         if (!old_ce_phys) {
3008                                 if (ext && devfn == 0) {
3009                                         /* No LCTP, try UCTP */
3010                                         devfn = 0x7f;
3011                                         continue;
3012                                 } else {
3013                                         goto out;
3014                                 }
3015                         }
3016
3017                         ret = -ENOMEM;
3018                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3019                                         MEMREMAP_WB);
3020                         if (!old_ce)
3021                                 goto out;
3022
3023                         new_ce = alloc_pgtable_page(iommu->node);
3024                         if (!new_ce)
3025                                 goto out_unmap;
3026
3027                         ret = 0;
3028                 }
3029
3030                 /* Now copy the context entry */
3031                 memcpy(&ce, old_ce + idx, sizeof(ce));
3032
3033                 if (!__context_present(&ce))
3034                         continue;
3035
3036                 did = context_domain_id(&ce);
3037                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3038                         set_bit(did, iommu->domain_ids);
3039
3040                 /*
3041                  * We need a marker for copied context entries. This
3042                  * marker needs to work for the old format as well as
3043                  * for extended context entries.
3044                  *
3045                  * Bit 67 of the context entry is used. In the old
3046                  * format this bit is available to software, in the
3047                  * extended format it is the PGE bit, but PGE is ignored
3048                  * by HW if PASIDs are disabled (and thus still
3049                  * available).
3050                  *
3051                  * So disable PASIDs first and then mark the entry
3052                  * copied. This means that we don't copy PASID
3053                  * translations from the old kernel, but this is fine as
3054                  * faults there are not fatal.
3055                  */
3056                 context_clear_pasid_enable(&ce);
3057                 context_set_copied(&ce);
3058
3059                 new_ce[idx] = ce;
3060         }
3061
3062         tbl[tbl_idx + pos] = new_ce;
3063
3064         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3065
3066 out_unmap:
3067         memunmap(old_ce);
3068
3069 out:
3070         return ret;
3071 }
3072
3073 static int copy_translation_tables(struct intel_iommu *iommu)
3074 {
3075         struct context_entry **ctxt_tbls;
3076         struct root_entry *old_rt;
3077         phys_addr_t old_rt_phys;
3078         int ctxt_table_entries;
3079         unsigned long flags;
3080         u64 rtaddr_reg;
3081         int bus, ret;
3082         bool new_ext, ext;
3083
3084         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3085         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3086         new_ext    = !!ecap_ecs(iommu->ecap);
3087
3088         /*
3089          * The RTT bit can only be changed when translation is disabled,
3090          * but disabling translation means to open a window for data
3091          * corruption. So bail out and don't copy anything if we would
3092          * have to change the bit.
3093          */
3094         if (new_ext != ext)
3095                 return -EINVAL;
3096
3097         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3098         if (!old_rt_phys)
3099                 return -EINVAL;
3100
3101         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3102         if (!old_rt)
3103                 return -ENOMEM;
3104
3105         /* This is too big for the stack - allocate it from slab */
3106         ctxt_table_entries = ext ? 512 : 256;
3107         ret = -ENOMEM;
3108         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3109         if (!ctxt_tbls)
3110                 goto out_unmap;
3111
3112         for (bus = 0; bus < 256; bus++) {
3113                 ret = copy_context_table(iommu, &old_rt[bus],
3114                                          ctxt_tbls, bus, ext);
3115                 if (ret) {
3116                         pr_err("%s: Failed to copy context table for bus %d\n",
3117                                 iommu->name, bus);
3118                         continue;
3119                 }
3120         }
3121
3122         spin_lock_irqsave(&iommu->lock, flags);
3123
3124         /* Context tables are copied, now write them to the root_entry table */
3125         for (bus = 0; bus < 256; bus++) {
3126                 int idx = ext ? bus * 2 : bus;
3127                 u64 val;
3128
3129                 if (ctxt_tbls[idx]) {
3130                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3131                         iommu->root_entry[bus].lo = val;
3132                 }
3133
3134                 if (!ext || !ctxt_tbls[idx + 1])
3135                         continue;
3136
3137                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3138                 iommu->root_entry[bus].hi = val;
3139         }
3140
3141         spin_unlock_irqrestore(&iommu->lock, flags);
3142
3143         kfree(ctxt_tbls);
3144
3145         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3146
3147         ret = 0;
3148
3149 out_unmap:
3150         memunmap(old_rt);
3151
3152         return ret;
3153 }
3154
3155 #ifdef CONFIG_INTEL_IOMMU_SVM
3156 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3157 {
3158         struct intel_iommu *iommu = data;
3159         ioasid_t ioasid;
3160
3161         if (!iommu)
3162                 return INVALID_IOASID;
3163         /*
3164          * VT-d virtual command interface always uses the full 20 bit
3165          * PASID range. Host can partition guest PASID range based on
3166          * policies but it is out of guest's control.
3167          */
3168         if (min < PASID_MIN || max > intel_pasid_max_id)
3169                 return INVALID_IOASID;
3170
3171         if (vcmd_alloc_pasid(iommu, &ioasid))
3172                 return INVALID_IOASID;
3173
3174         return ioasid;
3175 }
3176
3177 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3178 {
3179         struct intel_iommu *iommu = data;
3180
3181         if (!iommu)
3182                 return;
3183         /*
3184          * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3185          * We can only free the PASID when all the devices are unbound.
3186          */
3187         if (ioasid_find(NULL, ioasid, NULL)) {
3188                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3189                 return;
3190         }
3191         vcmd_free_pasid(iommu, ioasid);
3192 }
3193
3194 static void register_pasid_allocator(struct intel_iommu *iommu)
3195 {
3196         /*
3197          * If we are running in the host, no need for custom allocator
3198          * in that PASIDs are allocated from the host system-wide.
3199          */
3200         if (!cap_caching_mode(iommu->cap))
3201                 return;
3202
3203         if (!sm_supported(iommu)) {
3204                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3205                 return;
3206         }
3207
3208         /*
3209          * Register a custom PASID allocator if we are running in a guest,
3210          * guest PASID must be obtained via virtual command interface.
3211          * There can be multiple vIOMMUs in each guest but only one allocator
3212          * is active. All vIOMMU allocators will eventually be calling the same
3213          * host allocator.
3214          */
3215         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3216                 return;
3217
3218         pr_info("Register custom PASID allocator\n");
3219         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3220         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3221         iommu->pasid_allocator.pdata = (void *)iommu;
3222         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3223                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3224                 /*
3225                  * Disable scalable mode on this IOMMU if there
3226                  * is no custom allocator. Mixing SM capable vIOMMU
3227                  * and non-SM vIOMMU are not supported.
3228                  */
3229                 intel_iommu_sm = 0;
3230         }
3231 }
3232 #endif
3233
3234 static int __init init_dmars(void)
3235 {
3236         struct dmar_drhd_unit *drhd;
3237         struct intel_iommu *iommu;
3238         int ret;
3239
3240         /*
3241          * for each drhd
3242          *    allocate root
3243          *    initialize and program root entry to not present
3244          * endfor
3245          */
3246         for_each_drhd_unit(drhd) {
3247                 /*
3248                  * lock not needed as this is only incremented in the single
3249                  * threaded kernel __init code path all other access are read
3250                  * only
3251                  */
3252                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3253                         g_num_of_iommus++;
3254                         continue;
3255                 }
3256                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3257         }
3258
3259         /* Preallocate enough resources for IOMMU hot-addition */
3260         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3261                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3262
3263         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3264                         GFP_KERNEL);
3265         if (!g_iommus) {
3266                 pr_err("Allocating global iommu array failed\n");
3267                 ret = -ENOMEM;
3268                 goto error;
3269         }
3270
3271         for_each_iommu(iommu, drhd) {
3272                 if (drhd->ignored) {
3273                         iommu_disable_translation(iommu);
3274                         continue;
3275                 }
3276
3277                 /*
3278                  * Find the max pasid size of all IOMMU's in the system.
3279                  * We need to ensure the system pasid table is no bigger
3280                  * than the smallest supported.
3281                  */
3282                 if (pasid_supported(iommu)) {
3283                         u32 temp = 2 << ecap_pss(iommu->ecap);
3284
3285                         intel_pasid_max_id = min_t(u32, temp,
3286                                                    intel_pasid_max_id);
3287                 }
3288
3289                 g_iommus[iommu->seq_id] = iommu;
3290
3291                 intel_iommu_init_qi(iommu);
3292
3293                 ret = iommu_init_domains(iommu);
3294                 if (ret)
3295                         goto free_iommu;
3296
3297                 init_translation_status(iommu);
3298
3299                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3300                         iommu_disable_translation(iommu);
3301                         clear_translation_pre_enabled(iommu);
3302                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3303                                 iommu->name);
3304                 }
3305
3306                 /*
3307                  * TBD:
3308                  * we could share the same root & context tables
3309                  * among all IOMMU's. Need to Split it later.
3310                  */
3311                 ret = iommu_alloc_root_entry(iommu);
3312                 if (ret)
3313                         goto free_iommu;
3314
3315                 if (translation_pre_enabled(iommu)) {
3316                         pr_info("Translation already enabled - trying to copy translation structures\n");
3317
3318                         ret = copy_translation_tables(iommu);
3319                         if (ret) {
3320                                 /*
3321                                  * We found the IOMMU with translation
3322                                  * enabled - but failed to copy over the
3323                                  * old root-entry table. Try to proceed
3324                                  * by disabling translation now and
3325                                  * allocating a clean root-entry table.
3326                                  * This might cause DMAR faults, but
3327                                  * probably the dump will still succeed.
3328                                  */
3329                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3330                                        iommu->name);
3331                                 iommu_disable_translation(iommu);
3332                                 clear_translation_pre_enabled(iommu);
3333                         } else {
3334                                 pr_info("Copied translation tables from previous kernel for %s\n",
3335                                         iommu->name);
3336                         }
3337                 }
3338
3339                 if (!ecap_pass_through(iommu->ecap))
3340                         hw_pass_through = 0;
3341                 intel_svm_check(iommu);
3342         }
3343
3344         /*
3345          * Now that qi is enabled on all iommus, set the root entry and flush
3346          * caches. This is required on some Intel X58 chipsets, otherwise the
3347          * flush_context function will loop forever and the boot hangs.
3348          */
3349         for_each_active_iommu(iommu, drhd) {
3350                 iommu_flush_write_buffer(iommu);
3351 #ifdef CONFIG_INTEL_IOMMU_SVM
3352                 register_pasid_allocator(iommu);
3353 #endif
3354                 iommu_set_root_entry(iommu);
3355                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3356                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3357         }
3358
3359 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3360         dmar_map_gfx = 0;
3361 #endif
3362
3363         if (!dmar_map_gfx)
3364                 iommu_identity_mapping |= IDENTMAP_GFX;
3365
3366         check_tylersburg_isoch();
3367
3368         ret = si_domain_init(hw_pass_through);
3369         if (ret)
3370                 goto free_iommu;
3371
3372         /*
3373          * for each drhd
3374          *   enable fault log
3375          *   global invalidate context cache
3376          *   global invalidate iotlb
3377          *   enable translation
3378          */
3379         for_each_iommu(iommu, drhd) {
3380                 if (drhd->ignored) {
3381                         /*
3382                          * we always have to disable PMRs or DMA may fail on
3383                          * this device
3384                          */
3385                         if (force_on)
3386                                 iommu_disable_protect_mem_regions(iommu);
3387                         continue;
3388                 }
3389
3390                 iommu_flush_write_buffer(iommu);
3391
3392 #ifdef CONFIG_INTEL_IOMMU_SVM
3393                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3394                         /*
3395                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3396                          * could cause possible lock race condition.
3397                          */
3398                         up_write(&dmar_global_lock);
3399                         ret = intel_svm_enable_prq(iommu);
3400                         down_write(&dmar_global_lock);
3401                         if (ret)
3402                                 goto free_iommu;
3403                 }
3404 #endif
3405                 ret = dmar_set_interrupt(iommu);
3406                 if (ret)
3407                         goto free_iommu;
3408         }
3409
3410         return 0;
3411
3412 free_iommu:
3413         for_each_active_iommu(iommu, drhd) {
3414                 disable_dmar_iommu(iommu);
3415                 free_dmar_iommu(iommu);
3416         }
3417
3418         kfree(g_iommus);
3419
3420 error:
3421         return ret;
3422 }
3423
3424 /* This takes a number of _MM_ pages, not VTD pages */
3425 static unsigned long intel_alloc_iova(struct device *dev,
3426                                      struct dmar_domain *domain,
3427                                      unsigned long nrpages, uint64_t dma_mask)
3428 {
3429         unsigned long iova_pfn;
3430
3431         /*
3432          * Restrict dma_mask to the width that the iommu can handle.
3433          * First-level translation restricts the input-address to a
3434          * canonical address (i.e., address bits 63:N have the same
3435          * value as address bit [N-1], where N is 48-bits with 4-level
3436          * paging and 57-bits with 5-level paging). Hence, skip bit
3437          * [N-1].
3438          */
3439         if (domain_use_first_level(domain))
3440                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3441                                  dma_mask);
3442         else
3443                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3444                                  dma_mask);
3445
3446         /* Ensure we reserve the whole size-aligned region */
3447         nrpages = __roundup_pow_of_two(nrpages);
3448
3449         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3450                 /*
3451                  * First try to allocate an io virtual address in
3452                  * DMA_BIT_MASK(32) and if that fails then try allocating
3453                  * from higher range
3454                  */
3455                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3456                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3457                 if (iova_pfn)
3458                         return iova_pfn;
3459         }
3460         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3461                                    IOVA_PFN(dma_mask), true);
3462         if (unlikely(!iova_pfn)) {
3463                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3464                              nrpages);
3465                 return 0;
3466         }
3467
3468         return iova_pfn;
3469 }
3470
3471 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3472                                      size_t size, int dir, u64 dma_mask)
3473 {
3474         struct dmar_domain *domain;
3475         phys_addr_t start_paddr;
3476         unsigned long iova_pfn;
3477         int prot = 0;
3478         int ret;
3479         struct intel_iommu *iommu;
3480         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3481
3482         BUG_ON(dir == DMA_NONE);
3483
3484         if (unlikely(attach_deferred(dev)))
3485                 do_deferred_attach(dev);
3486
3487         domain = find_domain(dev);
3488         if (!domain)
3489                 return DMA_MAPPING_ERROR;
3490
3491         iommu = domain_get_iommu(domain);
3492         size = aligned_nrpages(paddr, size);
3493
3494         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3495         if (!iova_pfn)
3496                 goto error;
3497
3498         /*
3499          * Check if DMAR supports zero-length reads on write only
3500          * mappings..
3501          */
3502         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3503                         !cap_zlr(iommu->cap))
3504                 prot |= DMA_PTE_READ;
3505         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3506                 prot |= DMA_PTE_WRITE;
3507         /*
3508          * paddr - (paddr + size) might be partial page, we should map the whole
3509          * page.  Note: if two part of one page are separately mapped, we
3510          * might have two guest_addr mapping to the same host paddr, but this
3511          * is not a big problem
3512          */
3513         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3514                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3515         if (ret)
3516                 goto error;
3517
3518         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3519         start_paddr += paddr & ~PAGE_MASK;
3520
3521         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3522
3523         return start_paddr;
3524
3525 error:
3526         if (iova_pfn)
3527                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3528         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3529                 size, (unsigned long long)paddr, dir);
3530         return DMA_MAPPING_ERROR;
3531 }
3532
3533 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3534                                  unsigned long offset, size_t size,
3535                                  enum dma_data_direction dir,
3536                                  unsigned long attrs)
3537 {
3538         return __intel_map_single(dev, page_to_phys(page) + offset,
3539                                   size, dir, *dev->dma_mask);
3540 }
3541
3542 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3543                                      size_t size, enum dma_data_direction dir,
3544                                      unsigned long attrs)
3545 {
3546         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3547 }
3548
3549 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3550 {
3551         struct dmar_domain *domain;
3552         unsigned long start_pfn, last_pfn;
3553         unsigned long nrpages;
3554         unsigned long iova_pfn;
3555         struct intel_iommu *iommu;
3556         struct page *freelist;
3557         struct pci_dev *pdev = NULL;
3558
3559         domain = find_domain(dev);
3560         BUG_ON(!domain);
3561
3562         iommu = domain_get_iommu(domain);
3563
3564         iova_pfn = IOVA_PFN(dev_addr);
3565
3566         nrpages = aligned_nrpages(dev_addr, size);
3567         start_pfn = mm_to_dma_pfn(iova_pfn);
3568         last_pfn = start_pfn + nrpages - 1;
3569
3570         if (dev_is_pci(dev))
3571                 pdev = to_pci_dev(dev);
3572
3573         freelist = domain_unmap(domain, start_pfn, last_pfn);
3574         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3575                         !has_iova_flush_queue(&domain->iovad)) {
3576                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3577                                       nrpages, !freelist, 0);
3578                 /* free iova */
3579                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3580                 dma_free_pagelist(freelist);
3581         } else {
3582                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3583                            (unsigned long)freelist);
3584                 /*
3585                  * queue up the release of the unmap to save the 1/6th of the
3586                  * cpu used up by the iotlb flush operation...
3587                  */
3588         }
3589
3590         trace_unmap_single(dev, dev_addr, size);
3591 }
3592
3593 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3594                              size_t size, enum dma_data_direction dir,
3595                              unsigned long attrs)
3596 {
3597         intel_unmap(dev, dev_addr, size);
3598 }
3599
3600 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3601                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3602 {
3603         intel_unmap(dev, dev_addr, size);
3604 }
3605
3606 static void *intel_alloc_coherent(struct device *dev, size_t size,
3607                                   dma_addr_t *dma_handle, gfp_t flags,
3608                                   unsigned long attrs)
3609 {
3610         struct page *page = NULL;
3611         int order;
3612
3613         if (unlikely(attach_deferred(dev)))
3614                 do_deferred_attach(dev);
3615
3616         size = PAGE_ALIGN(size);
3617         order = get_order(size);
3618
3619         if (gfpflags_allow_blocking(flags)) {
3620                 unsigned int count = size >> PAGE_SHIFT;
3621
3622                 page = dma_alloc_from_contiguous(dev, count, order,
3623                                                  flags & __GFP_NOWARN);
3624         }
3625
3626         if (!page)
3627                 page = alloc_pages(flags, order);
3628         if (!page)
3629                 return NULL;
3630         memset(page_address(page), 0, size);
3631
3632         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3633                                          DMA_BIDIRECTIONAL,
3634                                          dev->coherent_dma_mask);
3635         if (*dma_handle != DMA_MAPPING_ERROR)
3636                 return page_address(page);
3637         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3638                 __free_pages(page, order);
3639
3640         return NULL;
3641 }
3642
3643 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3644                                 dma_addr_t dma_handle, unsigned long attrs)
3645 {
3646         int order;
3647         struct page *page = virt_to_page(vaddr);
3648
3649         size = PAGE_ALIGN(size);
3650         order = get_order(size);
3651
3652         intel_unmap(dev, dma_handle, size);
3653         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3654                 __free_pages(page, order);
3655 }
3656
3657 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3658                            int nelems, enum dma_data_direction dir,
3659                            unsigned long attrs)
3660 {
3661         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3662         unsigned long nrpages = 0;
3663         struct scatterlist *sg;
3664         int i;
3665
3666         for_each_sg(sglist, sg, nelems, i) {
3667                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3668         }
3669
3670         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3671
3672         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3673 }
3674
3675 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3676                         enum dma_data_direction dir, unsigned long attrs)
3677 {
3678         int i;
3679         struct dmar_domain *domain;
3680         size_t size = 0;
3681         int prot = 0;
3682         unsigned long iova_pfn;
3683         int ret;
3684         struct scatterlist *sg;
3685         unsigned long start_vpfn;
3686         struct intel_iommu *iommu;
3687
3688         BUG_ON(dir == DMA_NONE);
3689
3690         if (unlikely(attach_deferred(dev)))
3691                 do_deferred_attach(dev);
3692
3693         domain = find_domain(dev);
3694         if (!domain)
3695                 return 0;
3696
3697         iommu = domain_get_iommu(domain);
3698
3699         for_each_sg(sglist, sg, nelems, i)
3700                 size += aligned_nrpages(sg->offset, sg->length);
3701
3702         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3703                                 *dev->dma_mask);
3704         if (!iova_pfn) {
3705                 sglist->dma_length = 0;
3706                 return 0;
3707         }
3708
3709         /*
3710          * Check if DMAR supports zero-length reads on write only
3711          * mappings..
3712          */
3713         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3714                         !cap_zlr(iommu->cap))
3715                 prot |= DMA_PTE_READ;
3716         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3717                 prot |= DMA_PTE_WRITE;
3718
3719         start_vpfn = mm_to_dma_pfn(iova_pfn);
3720
3721         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3722         if (unlikely(ret)) {
3723                 dma_pte_free_pagetable(domain, start_vpfn,
3724                                        start_vpfn + size - 1,
3725                                        agaw_to_level(domain->agaw) + 1);
3726                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3727                 return 0;
3728         }
3729
3730         for_each_sg(sglist, sg, nelems, i)
3731                 trace_map_sg(dev, i + 1, nelems, sg);
3732
3733         return nelems;
3734 }
3735
3736 static u64 intel_get_required_mask(struct device *dev)
3737 {
3738         return DMA_BIT_MASK(32);
3739 }
3740
3741 static const struct dma_map_ops intel_dma_ops = {
3742         .alloc = intel_alloc_coherent,
3743         .free = intel_free_coherent,
3744         .map_sg = intel_map_sg,
3745         .unmap_sg = intel_unmap_sg,
3746         .map_page = intel_map_page,
3747         .unmap_page = intel_unmap_page,
3748         .map_resource = intel_map_resource,
3749         .unmap_resource = intel_unmap_resource,
3750         .dma_supported = dma_direct_supported,
3751         .mmap = dma_common_mmap,
3752         .get_sgtable = dma_common_get_sgtable,
3753         .alloc_pages = dma_common_alloc_pages,
3754         .free_pages = dma_common_free_pages,
3755         .get_required_mask = intel_get_required_mask,
3756 };
3757
3758 static void
3759 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3760                    enum dma_data_direction dir, enum dma_sync_target target)
3761 {
3762         struct dmar_domain *domain;
3763         phys_addr_t tlb_addr;
3764
3765         domain = find_domain(dev);
3766         if (WARN_ON(!domain))
3767                 return;
3768
3769         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3770         if (is_swiotlb_buffer(tlb_addr))
3771                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3772 }
3773
3774 static dma_addr_t
3775 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3776                   enum dma_data_direction dir, unsigned long attrs,
3777                   u64 dma_mask)
3778 {
3779         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3780         struct dmar_domain *domain;
3781         struct intel_iommu *iommu;
3782         unsigned long iova_pfn;
3783         unsigned long nrpages;
3784         phys_addr_t tlb_addr;
3785         int prot = 0;
3786         int ret;
3787
3788         if (unlikely(attach_deferred(dev)))
3789                 do_deferred_attach(dev);
3790
3791         domain = find_domain(dev);
3792
3793         if (WARN_ON(dir == DMA_NONE || !domain))
3794                 return DMA_MAPPING_ERROR;
3795
3796         iommu = domain_get_iommu(domain);
3797         if (WARN_ON(!iommu))
3798                 return DMA_MAPPING_ERROR;
3799
3800         nrpages = aligned_nrpages(0, size);
3801         iova_pfn = intel_alloc_iova(dev, domain,
3802                                     dma_to_mm_pfn(nrpages), dma_mask);
3803         if (!iova_pfn)
3804                 return DMA_MAPPING_ERROR;
3805
3806         /*
3807          * Check if DMAR supports zero-length reads on write only
3808          * mappings..
3809          */
3810         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3811                         !cap_zlr(iommu->cap))
3812                 prot |= DMA_PTE_READ;
3813         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3814                 prot |= DMA_PTE_WRITE;
3815
3816         /*
3817          * If both the physical buffer start address and size are
3818          * page aligned, we don't need to use a bounce page.
3819          */
3820         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3821                 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3822                                 aligned_size, dir, attrs);
3823                 if (tlb_addr == DMA_MAPPING_ERROR) {
3824                         goto swiotlb_error;
3825                 } else {
3826                         /* Cleanup the padding area. */
3827                         void *padding_start = phys_to_virt(tlb_addr);
3828                         size_t padding_size = aligned_size;
3829
3830                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3831                             (dir == DMA_TO_DEVICE ||
3832                              dir == DMA_BIDIRECTIONAL)) {
3833                                 padding_start += size;
3834                                 padding_size -= size;
3835                         }
3836
3837                         memset(padding_start, 0, padding_size);
3838                 }
3839         } else {
3840                 tlb_addr = paddr;
3841         }
3842
3843         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3844                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3845         if (ret)
3846                 goto mapping_error;
3847
3848         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3849
3850         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3851
3852 mapping_error:
3853         if (is_swiotlb_buffer(tlb_addr))
3854                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3855                                          aligned_size, dir, attrs);
3856 swiotlb_error:
3857         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3858         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3859                 size, (unsigned long long)paddr, dir);
3860
3861         return DMA_MAPPING_ERROR;
3862 }
3863
3864 static void
3865 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3866                     enum dma_data_direction dir, unsigned long attrs)
3867 {
3868         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3869         struct dmar_domain *domain;
3870         phys_addr_t tlb_addr;
3871
3872         domain = find_domain(dev);
3873         if (WARN_ON(!domain))
3874                 return;
3875
3876         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3877         if (WARN_ON(!tlb_addr))
3878                 return;
3879
3880         intel_unmap(dev, dev_addr, size);
3881         if (is_swiotlb_buffer(tlb_addr))
3882                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3883                                          aligned_size, dir, attrs);
3884
3885         trace_bounce_unmap_single(dev, dev_addr, size);
3886 }
3887
3888 static dma_addr_t
3889 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3890                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3891 {
3892         return bounce_map_single(dev, page_to_phys(page) + offset,
3893                                  size, dir, attrs, *dev->dma_mask);
3894 }
3895
3896 static dma_addr_t
3897 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3898                     enum dma_data_direction dir, unsigned long attrs)
3899 {
3900         return bounce_map_single(dev, phys_addr, size,
3901                                  dir, attrs, *dev->dma_mask);
3902 }
3903
3904 static void
3905 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3906                   enum dma_data_direction dir, unsigned long attrs)
3907 {
3908         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3909 }
3910
3911 static void
3912 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3913                       enum dma_data_direction dir, unsigned long attrs)
3914 {
3915         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3916 }
3917
3918 static void
3919 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3920                 enum dma_data_direction dir, unsigned long attrs)
3921 {
3922         struct scatterlist *sg;
3923         int i;
3924
3925         for_each_sg(sglist, sg, nelems, i)
3926                 bounce_unmap_page(dev, sg->dma_address,
3927                                   sg_dma_len(sg), dir, attrs);
3928 }
3929
3930 static int
3931 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3932               enum dma_data_direction dir, unsigned long attrs)
3933 {
3934         int i;
3935         struct scatterlist *sg;
3936
3937         for_each_sg(sglist, sg, nelems, i) {
3938                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3939                                                   sg->offset, sg->length,
3940                                                   dir, attrs);
3941                 if (sg->dma_address == DMA_MAPPING_ERROR)
3942                         goto out_unmap;
3943                 sg_dma_len(sg) = sg->length;
3944         }
3945
3946         for_each_sg(sglist, sg, nelems, i)
3947                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3948
3949         return nelems;
3950
3951 out_unmap:
3952         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3953         return 0;
3954 }
3955
3956 static void
3957 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3958                            size_t size, enum dma_data_direction dir)
3959 {
3960         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3961 }
3962
3963 static void
3964 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3965                               size_t size, enum dma_data_direction dir)
3966 {
3967         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3968 }
3969
3970 static void
3971 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3972                        int nelems, enum dma_data_direction dir)
3973 {
3974         struct scatterlist *sg;
3975         int i;
3976
3977         for_each_sg(sglist, sg, nelems, i)
3978                 bounce_sync_single(dev, sg_dma_address(sg),
3979                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3980 }
3981
3982 static void
3983 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3984                           int nelems, enum dma_data_direction dir)
3985 {
3986         struct scatterlist *sg;
3987         int i;
3988
3989         for_each_sg(sglist, sg, nelems, i)
3990                 bounce_sync_single(dev, sg_dma_address(sg),
3991                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3992 }
3993
3994 static const struct dma_map_ops bounce_dma_ops = {
3995         .alloc                  = intel_alloc_coherent,
3996         .free                   = intel_free_coherent,
3997         .map_sg                 = bounce_map_sg,
3998         .unmap_sg               = bounce_unmap_sg,
3999         .map_page               = bounce_map_page,
4000         .unmap_page             = bounce_unmap_page,
4001         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4002         .sync_single_for_device = bounce_sync_single_for_device,
4003         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4004         .sync_sg_for_device     = bounce_sync_sg_for_device,
4005         .map_resource           = bounce_map_resource,
4006         .unmap_resource         = bounce_unmap_resource,
4007         .alloc_pages            = dma_common_alloc_pages,
4008         .free_pages             = dma_common_free_pages,
4009         .dma_supported          = dma_direct_supported,
4010 };
4011
4012 static inline int iommu_domain_cache_init(void)
4013 {
4014         int ret = 0;
4015
4016         iommu_domain_cache = kmem_cache_create("iommu_domain",
4017                                          sizeof(struct dmar_domain),
4018                                          0,
4019                                          SLAB_HWCACHE_ALIGN,
4020
4021                                          NULL);
4022         if (!iommu_domain_cache) {
4023                 pr_err("Couldn't create iommu_domain cache\n");
4024                 ret = -ENOMEM;
4025         }
4026
4027         return ret;
4028 }
4029
4030 static inline int iommu_devinfo_cache_init(void)
4031 {
4032         int ret = 0;
4033
4034         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4035                                          sizeof(struct device_domain_info),
4036                                          0,
4037                                          SLAB_HWCACHE_ALIGN,
4038                                          NULL);
4039         if (!iommu_devinfo_cache) {
4040                 pr_err("Couldn't create devinfo cache\n");
4041                 ret = -ENOMEM;
4042         }
4043
4044         return ret;
4045 }
4046
4047 static int __init iommu_init_mempool(void)
4048 {
4049         int ret;
4050         ret = iova_cache_get();
4051         if (ret)
4052                 return ret;
4053
4054         ret = iommu_domain_cache_init();
4055         if (ret)
4056                 goto domain_error;
4057
4058         ret = iommu_devinfo_cache_init();
4059         if (!ret)
4060                 return ret;
4061
4062         kmem_cache_destroy(iommu_domain_cache);
4063 domain_error:
4064         iova_cache_put();
4065
4066         return -ENOMEM;
4067 }
4068
4069 static void __init iommu_exit_mempool(void)
4070 {
4071         kmem_cache_destroy(iommu_devinfo_cache);
4072         kmem_cache_destroy(iommu_domain_cache);
4073         iova_cache_put();
4074 }
4075
4076 static void __init init_no_remapping_devices(void)
4077 {
4078         struct dmar_drhd_unit *drhd;
4079         struct device *dev;
4080         int i;
4081
4082         for_each_drhd_unit(drhd) {
4083                 if (!drhd->include_all) {
4084                         for_each_active_dev_scope(drhd->devices,
4085                                                   drhd->devices_cnt, i, dev)
4086                                 break;
4087                         /* ignore DMAR unit if no devices exist */
4088                         if (i == drhd->devices_cnt)
4089                                 drhd->ignored = 1;
4090                 }
4091         }
4092
4093         for_each_active_drhd_unit(drhd) {
4094                 if (drhd->include_all)
4095                         continue;
4096
4097                 for_each_active_dev_scope(drhd->devices,
4098                                           drhd->devices_cnt, i, dev)
4099                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4100                                 break;
4101                 if (i < drhd->devices_cnt)
4102                         continue;
4103
4104                 /* This IOMMU has *only* gfx devices. Either bypass it or
4105                    set the gfx_mapped flag, as appropriate */
4106                 drhd->gfx_dedicated = 1;
4107                 if (!dmar_map_gfx)
4108                         drhd->ignored = 1;
4109         }
4110 }
4111
4112 #ifdef CONFIG_SUSPEND
4113 static int init_iommu_hw(void)
4114 {
4115         struct dmar_drhd_unit *drhd;
4116         struct intel_iommu *iommu = NULL;
4117
4118         for_each_active_iommu(iommu, drhd)
4119                 if (iommu->qi)
4120                         dmar_reenable_qi(iommu);
4121
4122         for_each_iommu(iommu, drhd) {
4123                 if (drhd->ignored) {
4124                         /*
4125                          * we always have to disable PMRs or DMA may fail on
4126                          * this device
4127                          */
4128                         if (force_on)
4129                                 iommu_disable_protect_mem_regions(iommu);
4130                         continue;
4131                 }
4132
4133                 iommu_flush_write_buffer(iommu);
4134
4135                 iommu_set_root_entry(iommu);
4136
4137                 iommu->flush.flush_context(iommu, 0, 0, 0,
4138                                            DMA_CCMD_GLOBAL_INVL);
4139                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4140                 iommu_enable_translation(iommu);
4141                 iommu_disable_protect_mem_regions(iommu);
4142         }
4143
4144         return 0;
4145 }
4146
4147 static void iommu_flush_all(void)
4148 {
4149         struct dmar_drhd_unit *drhd;
4150         struct intel_iommu *iommu;
4151
4152         for_each_active_iommu(iommu, drhd) {
4153                 iommu->flush.flush_context(iommu, 0, 0, 0,
4154                                            DMA_CCMD_GLOBAL_INVL);
4155                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4156                                          DMA_TLB_GLOBAL_FLUSH);
4157         }
4158 }
4159
4160 static int iommu_suspend(void)
4161 {
4162         struct dmar_drhd_unit *drhd;
4163         struct intel_iommu *iommu = NULL;
4164         unsigned long flag;
4165
4166         for_each_active_iommu(iommu, drhd) {
4167                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4168                                                  GFP_ATOMIC);
4169                 if (!iommu->iommu_state)
4170                         goto nomem;
4171         }
4172
4173         iommu_flush_all();
4174
4175         for_each_active_iommu(iommu, drhd) {
4176                 iommu_disable_translation(iommu);
4177
4178                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4179
4180                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4181                         readl(iommu->reg + DMAR_FECTL_REG);
4182                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4183                         readl(iommu->reg + DMAR_FEDATA_REG);
4184                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4185                         readl(iommu->reg + DMAR_FEADDR_REG);
4186                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4187                         readl(iommu->reg + DMAR_FEUADDR_REG);
4188
4189                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4190         }
4191         return 0;
4192
4193 nomem:
4194         for_each_active_iommu(iommu, drhd)
4195                 kfree(iommu->iommu_state);
4196
4197         return -ENOMEM;
4198 }
4199
4200 static void iommu_resume(void)
4201 {
4202         struct dmar_drhd_unit *drhd;
4203         struct intel_iommu *iommu = NULL;
4204         unsigned long flag;
4205
4206         if (init_iommu_hw()) {
4207                 if (force_on)
4208                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4209                 else
4210                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4211                 return;
4212         }
4213
4214         for_each_active_iommu(iommu, drhd) {
4215
4216                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4217
4218                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4219                         iommu->reg + DMAR_FECTL_REG);
4220                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4221                         iommu->reg + DMAR_FEDATA_REG);
4222                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4223                         iommu->reg + DMAR_FEADDR_REG);
4224                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4225                         iommu->reg + DMAR_FEUADDR_REG);
4226
4227                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4228         }
4229
4230         for_each_active_iommu(iommu, drhd)
4231                 kfree(iommu->iommu_state);
4232 }
4233
4234 static struct syscore_ops iommu_syscore_ops = {
4235         .resume         = iommu_resume,
4236         .suspend        = iommu_suspend,
4237 };
4238
4239 static void __init init_iommu_pm_ops(void)
4240 {
4241         register_syscore_ops(&iommu_syscore_ops);
4242 }
4243
4244 #else
4245 static inline void init_iommu_pm_ops(void) {}
4246 #endif  /* CONFIG_PM */
4247
4248 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4249 {
4250         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4251             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4252             rmrr->end_address <= rmrr->base_address ||
4253             arch_rmrr_sanity_check(rmrr))
4254                 return -EINVAL;
4255
4256         return 0;
4257 }
4258
4259 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4260 {
4261         struct acpi_dmar_reserved_memory *rmrr;
4262         struct dmar_rmrr_unit *rmrru;
4263
4264         rmrr = (struct acpi_dmar_reserved_memory *)header;
4265         if (rmrr_sanity_check(rmrr)) {
4266                 pr_warn(FW_BUG
4267                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4268                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4269                            rmrr->base_address, rmrr->end_address,
4270                            dmi_get_system_info(DMI_BIOS_VENDOR),
4271                            dmi_get_system_info(DMI_BIOS_VERSION),
4272                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4273                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4274         }
4275
4276         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4277         if (!rmrru)
4278                 goto out;
4279
4280         rmrru->hdr = header;
4281
4282         rmrru->base_address = rmrr->base_address;
4283         rmrru->end_address = rmrr->end_address;
4284
4285         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4286                                 ((void *)rmrr) + rmrr->header.length,
4287                                 &rmrru->devices_cnt);
4288         if (rmrru->devices_cnt && rmrru->devices == NULL)
4289                 goto free_rmrru;
4290
4291         list_add(&rmrru->list, &dmar_rmrr_units);
4292
4293         return 0;
4294 free_rmrru:
4295         kfree(rmrru);
4296 out:
4297         return -ENOMEM;
4298 }
4299
4300 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4301 {
4302         struct dmar_atsr_unit *atsru;
4303         struct acpi_dmar_atsr *tmp;
4304
4305         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4306                                 dmar_rcu_check()) {
4307                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4308                 if (atsr->segment != tmp->segment)
4309                         continue;
4310                 if (atsr->header.length != tmp->header.length)
4311                         continue;
4312                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4313                         return atsru;
4314         }
4315
4316         return NULL;
4317 }
4318
4319 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4320 {
4321         struct acpi_dmar_atsr *atsr;
4322         struct dmar_atsr_unit *atsru;
4323
4324         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4325                 return 0;
4326
4327         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4328         atsru = dmar_find_atsr(atsr);
4329         if (atsru)
4330                 return 0;
4331
4332         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4333         if (!atsru)
4334                 return -ENOMEM;
4335
4336         /*
4337          * If memory is allocated from slab by ACPI _DSM method, we need to
4338          * copy the memory content because the memory buffer will be freed
4339          * on return.
4340          */
4341         atsru->hdr = (void *)(atsru + 1);
4342         memcpy(atsru->hdr, hdr, hdr->length);
4343         atsru->include_all = atsr->flags & 0x1;
4344         if (!atsru->include_all) {
4345                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4346                                 (void *)atsr + atsr->header.length,
4347                                 &atsru->devices_cnt);
4348                 if (atsru->devices_cnt && atsru->devices == NULL) {
4349                         kfree(atsru);
4350                         return -ENOMEM;
4351                 }
4352         }
4353
4354         list_add_rcu(&atsru->list, &dmar_atsr_units);
4355
4356         return 0;
4357 }
4358
4359 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4360 {
4361         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4362         kfree(atsru);
4363 }
4364
4365 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4366 {
4367         struct acpi_dmar_atsr *atsr;
4368         struct dmar_atsr_unit *atsru;
4369
4370         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4371         atsru = dmar_find_atsr(atsr);
4372         if (atsru) {
4373                 list_del_rcu(&atsru->list);
4374                 synchronize_rcu();
4375                 intel_iommu_free_atsr(atsru);
4376         }
4377
4378         return 0;
4379 }
4380
4381 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4382 {
4383         int i;
4384         struct device *dev;
4385         struct acpi_dmar_atsr *atsr;
4386         struct dmar_atsr_unit *atsru;
4387
4388         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4389         atsru = dmar_find_atsr(atsr);
4390         if (!atsru)
4391                 return 0;
4392
4393         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4394                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4395                                           i, dev)
4396                         return -EBUSY;
4397         }
4398
4399         return 0;
4400 }
4401
4402 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4403 {
4404         int sp, ret;
4405         struct intel_iommu *iommu = dmaru->iommu;
4406
4407         if (g_iommus[iommu->seq_id])
4408                 return 0;
4409
4410         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4411                 pr_warn("%s: Doesn't support hardware pass through.\n",
4412                         iommu->name);
4413                 return -ENXIO;
4414         }
4415         if (!ecap_sc_support(iommu->ecap) &&
4416             domain_update_iommu_snooping(iommu)) {
4417                 pr_warn("%s: Doesn't support snooping.\n",
4418                         iommu->name);
4419                 return -ENXIO;
4420         }
4421         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4422         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4423                 pr_warn("%s: Doesn't support large page.\n",
4424                         iommu->name);
4425                 return -ENXIO;
4426         }
4427
4428         /*
4429          * Disable translation if already enabled prior to OS handover.
4430          */
4431         if (iommu->gcmd & DMA_GCMD_TE)
4432                 iommu_disable_translation(iommu);
4433
4434         g_iommus[iommu->seq_id] = iommu;
4435         ret = iommu_init_domains(iommu);
4436         if (ret == 0)
4437                 ret = iommu_alloc_root_entry(iommu);
4438         if (ret)
4439                 goto out;
4440
4441         intel_svm_check(iommu);
4442
4443         if (dmaru->ignored) {
4444                 /*
4445                  * we always have to disable PMRs or DMA may fail on this device
4446                  */
4447                 if (force_on)
4448                         iommu_disable_protect_mem_regions(iommu);
4449                 return 0;
4450         }
4451
4452         intel_iommu_init_qi(iommu);
4453         iommu_flush_write_buffer(iommu);
4454
4455 #ifdef CONFIG_INTEL_IOMMU_SVM
4456         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4457                 ret = intel_svm_enable_prq(iommu);
4458                 if (ret)
4459                         goto disable_iommu;
4460         }
4461 #endif
4462         ret = dmar_set_interrupt(iommu);
4463         if (ret)
4464                 goto disable_iommu;
4465
4466         iommu_set_root_entry(iommu);
4467         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4468         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4469         iommu_enable_translation(iommu);
4470
4471         iommu_disable_protect_mem_regions(iommu);
4472         return 0;
4473
4474 disable_iommu:
4475         disable_dmar_iommu(iommu);
4476 out:
4477         free_dmar_iommu(iommu);
4478         return ret;
4479 }
4480
4481 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4482 {
4483         int ret = 0;
4484         struct intel_iommu *iommu = dmaru->iommu;
4485
4486         if (!intel_iommu_enabled)
4487                 return 0;
4488         if (iommu == NULL)
4489                 return -EINVAL;
4490
4491         if (insert) {
4492                 ret = intel_iommu_add(dmaru);
4493         } else {
4494                 disable_dmar_iommu(iommu);
4495                 free_dmar_iommu(iommu);
4496         }
4497
4498         return ret;
4499 }
4500
4501 static void intel_iommu_free_dmars(void)
4502 {
4503         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4504         struct dmar_atsr_unit *atsru, *atsr_n;
4505
4506         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4507                 list_del(&rmrru->list);
4508                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4509                 kfree(rmrru);
4510         }
4511
4512         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4513                 list_del(&atsru->list);
4514                 intel_iommu_free_atsr(atsru);
4515         }
4516 }
4517
4518 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4519 {
4520         int i, ret = 1;
4521         struct pci_bus *bus;
4522         struct pci_dev *bridge = NULL;
4523         struct device *tmp;
4524         struct acpi_dmar_atsr *atsr;
4525         struct dmar_atsr_unit *atsru;
4526
4527         dev = pci_physfn(dev);
4528         for (bus = dev->bus; bus; bus = bus->parent) {
4529                 bridge = bus->self;
4530                 /* If it's an integrated device, allow ATS */
4531                 if (!bridge)
4532                         return 1;
4533                 /* Connected via non-PCIe: no ATS */
4534                 if (!pci_is_pcie(bridge) ||
4535                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4536                         return 0;
4537                 /* If we found the root port, look it up in the ATSR */
4538                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4539                         break;
4540         }
4541
4542         rcu_read_lock();
4543         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4544                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4545                 if (atsr->segment != pci_domain_nr(dev->bus))
4546                         continue;
4547
4548                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4549                         if (tmp == &bridge->dev)
4550                                 goto out;
4551
4552                 if (atsru->include_all)
4553                         goto out;
4554         }
4555         ret = 0;
4556 out:
4557         rcu_read_unlock();
4558
4559         return ret;
4560 }
4561
4562 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4563 {
4564         int ret;
4565         struct dmar_rmrr_unit *rmrru;
4566         struct dmar_atsr_unit *atsru;
4567         struct acpi_dmar_atsr *atsr;
4568         struct acpi_dmar_reserved_memory *rmrr;
4569
4570         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4571                 return 0;
4572
4573         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4574                 rmrr = container_of(rmrru->hdr,
4575                                     struct acpi_dmar_reserved_memory, header);
4576                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4577                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4578                                 ((void *)rmrr) + rmrr->header.length,
4579                                 rmrr->segment, rmrru->devices,
4580                                 rmrru->devices_cnt);
4581                         if (ret < 0)
4582                                 return ret;
4583                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4584                         dmar_remove_dev_scope(info, rmrr->segment,
4585                                 rmrru->devices, rmrru->devices_cnt);
4586                 }
4587         }
4588
4589         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4590                 if (atsru->include_all)
4591                         continue;
4592
4593                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4594                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4595                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4596                                         (void *)atsr + atsr->header.length,
4597                                         atsr->segment, atsru->devices,
4598                                         atsru->devices_cnt);
4599                         if (ret > 0)
4600                                 break;
4601                         else if (ret < 0)
4602                                 return ret;
4603                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4604                         if (dmar_remove_dev_scope(info, atsr->segment,
4605                                         atsru->devices, atsru->devices_cnt))
4606                                 break;
4607                 }
4608         }
4609
4610         return 0;
4611 }
4612
4613 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4614                                        unsigned long val, void *v)
4615 {
4616         struct memory_notify *mhp = v;
4617         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4618         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4619                         mhp->nr_pages - 1);
4620
4621         switch (val) {
4622         case MEM_GOING_ONLINE:
4623                 if (iommu_domain_identity_map(si_domain,
4624                                               start_vpfn, last_vpfn)) {
4625                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4626                                 start_vpfn, last_vpfn);
4627                         return NOTIFY_BAD;
4628                 }
4629                 break;
4630
4631         case MEM_OFFLINE:
4632         case MEM_CANCEL_ONLINE:
4633                 {
4634                         struct dmar_drhd_unit *drhd;
4635                         struct intel_iommu *iommu;
4636                         struct page *freelist;
4637
4638                         freelist = domain_unmap(si_domain,
4639                                                 start_vpfn, last_vpfn);
4640
4641                         rcu_read_lock();
4642                         for_each_active_iommu(iommu, drhd)
4643                                 iommu_flush_iotlb_psi(iommu, si_domain,
4644                                         start_vpfn, mhp->nr_pages,
4645                                         !freelist, 0);
4646                         rcu_read_unlock();
4647                         dma_free_pagelist(freelist);
4648                 }
4649                 break;
4650         }
4651
4652         return NOTIFY_OK;
4653 }
4654
4655 static struct notifier_block intel_iommu_memory_nb = {
4656         .notifier_call = intel_iommu_memory_notifier,
4657         .priority = 0
4658 };
4659
4660 static void free_all_cpu_cached_iovas(unsigned int cpu)
4661 {
4662         int i;
4663
4664         for (i = 0; i < g_num_of_iommus; i++) {
4665                 struct intel_iommu *iommu = g_iommus[i];
4666                 struct dmar_domain *domain;
4667                 int did;
4668
4669                 if (!iommu)
4670                         continue;
4671
4672                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4673                         domain = get_iommu_domain(iommu, (u16)did);
4674
4675                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4676                                 continue;
4677
4678                         free_cpu_cached_iovas(cpu, &domain->iovad);
4679                 }
4680         }
4681 }
4682
4683 static int intel_iommu_cpu_dead(unsigned int cpu)
4684 {
4685         free_all_cpu_cached_iovas(cpu);
4686         return 0;
4687 }
4688
4689 static void intel_disable_iommus(void)
4690 {
4691         struct intel_iommu *iommu = NULL;
4692         struct dmar_drhd_unit *drhd;
4693
4694         for_each_iommu(iommu, drhd)
4695                 iommu_disable_translation(iommu);
4696 }
4697
4698 void intel_iommu_shutdown(void)
4699 {
4700         struct dmar_drhd_unit *drhd;
4701         struct intel_iommu *iommu = NULL;
4702
4703         if (no_iommu || dmar_disabled)
4704                 return;
4705
4706         down_write(&dmar_global_lock);
4707
4708         /* Disable PMRs explicitly here. */
4709         for_each_iommu(iommu, drhd)
4710                 iommu_disable_protect_mem_regions(iommu);
4711
4712         /* Make sure the IOMMUs are switched off */
4713         intel_disable_iommus();
4714
4715         up_write(&dmar_global_lock);
4716 }
4717
4718 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4719 {
4720         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4721
4722         return container_of(iommu_dev, struct intel_iommu, iommu);
4723 }
4724
4725 static ssize_t intel_iommu_show_version(struct device *dev,
4726                                         struct device_attribute *attr,
4727                                         char *buf)
4728 {
4729         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4730         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4731         return sprintf(buf, "%d:%d\n",
4732                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4733 }
4734 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4735
4736 static ssize_t intel_iommu_show_address(struct device *dev,
4737                                         struct device_attribute *attr,
4738                                         char *buf)
4739 {
4740         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4741         return sprintf(buf, "%llx\n", iommu->reg_phys);
4742 }
4743 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4744
4745 static ssize_t intel_iommu_show_cap(struct device *dev,
4746                                     struct device_attribute *attr,
4747                                     char *buf)
4748 {
4749         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4750         return sprintf(buf, "%llx\n", iommu->cap);
4751 }
4752 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4753
4754 static ssize_t intel_iommu_show_ecap(struct device *dev,
4755                                     struct device_attribute *attr,
4756                                     char *buf)
4757 {
4758         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4759         return sprintf(buf, "%llx\n", iommu->ecap);
4760 }
4761 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4762
4763 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4764                                       struct device_attribute *attr,
4765                                       char *buf)
4766 {
4767         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4768         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4769 }
4770 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4771
4772 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4773                                            struct device_attribute *attr,
4774                                            char *buf)
4775 {
4776         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4777         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4778                                                   cap_ndoms(iommu->cap)));
4779 }
4780 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4781
4782 static struct attribute *intel_iommu_attrs[] = {
4783         &dev_attr_version.attr,
4784         &dev_attr_address.attr,
4785         &dev_attr_cap.attr,
4786         &dev_attr_ecap.attr,
4787         &dev_attr_domains_supported.attr,
4788         &dev_attr_domains_used.attr,
4789         NULL,
4790 };
4791
4792 static struct attribute_group intel_iommu_group = {
4793         .name = "intel-iommu",
4794         .attrs = intel_iommu_attrs,
4795 };
4796
4797 const struct attribute_group *intel_iommu_groups[] = {
4798         &intel_iommu_group,
4799         NULL,
4800 };
4801
4802 static inline bool has_external_pci(void)
4803 {
4804         struct pci_dev *pdev = NULL;
4805
4806         for_each_pci_dev(pdev)
4807                 if (pdev->external_facing)
4808                         return true;
4809
4810         return false;
4811 }
4812
4813 static int __init platform_optin_force_iommu(void)
4814 {
4815         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4816                 return 0;
4817
4818         if (no_iommu || dmar_disabled)
4819                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4820
4821         /*
4822          * If Intel-IOMMU is disabled by default, we will apply identity
4823          * map for all devices except those marked as being untrusted.
4824          */
4825         if (dmar_disabled)
4826                 iommu_set_default_passthrough(false);
4827
4828         dmar_disabled = 0;
4829         no_iommu = 0;
4830
4831         return 1;
4832 }
4833
4834 static int __init probe_acpi_namespace_devices(void)
4835 {
4836         struct dmar_drhd_unit *drhd;
4837         /* To avoid a -Wunused-but-set-variable warning. */
4838         struct intel_iommu *iommu __maybe_unused;
4839         struct device *dev;
4840         int i, ret = 0;
4841
4842         for_each_active_iommu(iommu, drhd) {
4843                 for_each_active_dev_scope(drhd->devices,
4844                                           drhd->devices_cnt, i, dev) {
4845                         struct acpi_device_physical_node *pn;
4846                         struct iommu_group *group;
4847                         struct acpi_device *adev;
4848
4849                         if (dev->bus != &acpi_bus_type)
4850                                 continue;
4851
4852                         adev = to_acpi_device(dev);
4853                         mutex_lock(&adev->physical_node_lock);
4854                         list_for_each_entry(pn,
4855                                             &adev->physical_node_list, node) {
4856                                 group = iommu_group_get(pn->dev);
4857                                 if (group) {
4858                                         iommu_group_put(group);
4859                                         continue;
4860                                 }
4861
4862                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4863                                 ret = iommu_probe_device(pn->dev);
4864                                 if (ret)
4865                                         break;
4866                         }
4867                         mutex_unlock(&adev->physical_node_lock);
4868
4869                         if (ret)
4870                                 return ret;
4871                 }
4872         }
4873
4874         return 0;
4875 }
4876
4877 int __init intel_iommu_init(void)
4878 {
4879         int ret = -ENODEV;
4880         struct dmar_drhd_unit *drhd;
4881         struct intel_iommu *iommu;
4882
4883         /*
4884          * Intel IOMMU is required for a TXT/tboot launch or platform
4885          * opt in, so enforce that.
4886          */
4887         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4888
4889         if (iommu_init_mempool()) {
4890                 if (force_on)
4891                         panic("tboot: Failed to initialize iommu memory\n");
4892                 return -ENOMEM;
4893         }
4894
4895         down_write(&dmar_global_lock);
4896         if (dmar_table_init()) {
4897                 if (force_on)
4898                         panic("tboot: Failed to initialize DMAR table\n");
4899                 goto out_free_dmar;
4900         }
4901
4902         if (dmar_dev_scope_init() < 0) {
4903                 if (force_on)
4904                         panic("tboot: Failed to initialize DMAR device scope\n");
4905                 goto out_free_dmar;
4906         }
4907
4908         up_write(&dmar_global_lock);
4909
4910         /*
4911          * The bus notifier takes the dmar_global_lock, so lockdep will
4912          * complain later when we register it under the lock.
4913          */
4914         dmar_register_bus_notifier();
4915
4916         down_write(&dmar_global_lock);
4917
4918         if (!no_iommu)
4919                 intel_iommu_debugfs_init();
4920
4921         if (no_iommu || dmar_disabled) {
4922                 /*
4923                  * We exit the function here to ensure IOMMU's remapping and
4924                  * mempool aren't setup, which means that the IOMMU's PMRs
4925                  * won't be disabled via the call to init_dmars(). So disable
4926                  * it explicitly here. The PMRs were setup by tboot prior to
4927                  * calling SENTER, but the kernel is expected to reset/tear
4928                  * down the PMRs.
4929                  */
4930                 if (intel_iommu_tboot_noforce) {
4931                         for_each_iommu(iommu, drhd)
4932                                 iommu_disable_protect_mem_regions(iommu);
4933                 }
4934
4935                 /*
4936                  * Make sure the IOMMUs are switched off, even when we
4937                  * boot into a kexec kernel and the previous kernel left
4938                  * them enabled
4939                  */
4940                 intel_disable_iommus();
4941                 goto out_free_dmar;
4942         }
4943
4944         if (list_empty(&dmar_rmrr_units))
4945                 pr_info("No RMRR found\n");
4946
4947         if (list_empty(&dmar_atsr_units))
4948                 pr_info("No ATSR found\n");
4949
4950         if (dmar_init_reserved_ranges()) {
4951                 if (force_on)
4952                         panic("tboot: Failed to reserve iommu ranges\n");
4953                 goto out_free_reserved_range;
4954         }
4955
4956         if (dmar_map_gfx)
4957                 intel_iommu_gfx_mapped = 1;
4958
4959         init_no_remapping_devices();
4960
4961         ret = init_dmars();
4962         if (ret) {
4963                 if (force_on)
4964                         panic("tboot: Failed to initialize DMARs\n");
4965                 pr_err("Initialization failed\n");
4966                 goto out_free_reserved_range;
4967         }
4968         up_write(&dmar_global_lock);
4969
4970         init_iommu_pm_ops();
4971
4972         down_read(&dmar_global_lock);
4973         for_each_active_iommu(iommu, drhd) {
4974                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4975                                        intel_iommu_groups,
4976                                        "%s", iommu->name);
4977                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4978                 iommu_device_register(&iommu->iommu);
4979         }
4980         up_read(&dmar_global_lock);
4981
4982         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4983         if (si_domain && !hw_pass_through)
4984                 register_memory_notifier(&intel_iommu_memory_nb);
4985         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4986                           intel_iommu_cpu_dead);
4987
4988         down_read(&dmar_global_lock);
4989         if (probe_acpi_namespace_devices())
4990                 pr_warn("ACPI name space devices didn't probe correctly\n");
4991
4992         /* Finally, we enable the DMA remapping hardware. */
4993         for_each_iommu(iommu, drhd) {
4994                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4995                         iommu_enable_translation(iommu);
4996
4997                 iommu_disable_protect_mem_regions(iommu);
4998         }
4999         up_read(&dmar_global_lock);
5000
5001         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5002
5003         intel_iommu_enabled = 1;
5004
5005         return 0;
5006
5007 out_free_reserved_range:
5008         put_iova_domain(&reserved_iova_list);
5009 out_free_dmar:
5010         intel_iommu_free_dmars();
5011         up_write(&dmar_global_lock);
5012         iommu_exit_mempool();
5013         return ret;
5014 }
5015
5016 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5017 {
5018         struct intel_iommu *iommu = opaque;
5019
5020         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5021         return 0;
5022 }
5023
5024 /*
5025  * NB - intel-iommu lacks any sort of reference counting for the users of
5026  * dependent devices.  If multiple endpoints have intersecting dependent
5027  * devices, unbinding the driver from any one of them will possibly leave
5028  * the others unable to operate.
5029  */
5030 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5031 {
5032         if (!iommu || !dev || !dev_is_pci(dev))
5033                 return;
5034
5035         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5036 }
5037
5038 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5039 {
5040         struct dmar_domain *domain;
5041         struct intel_iommu *iommu;
5042         unsigned long flags;
5043
5044         assert_spin_locked(&device_domain_lock);
5045
5046         if (WARN_ON(!info))
5047                 return;
5048
5049         iommu = info->iommu;
5050         domain = info->domain;
5051
5052         if (info->dev) {
5053                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5054                         intel_pasid_tear_down_entry(iommu, info->dev,
5055                                         PASID_RID2PASID, false);
5056
5057                 iommu_disable_dev_iotlb(info);
5058                 if (!dev_is_real_dma_subdevice(info->dev))
5059                         domain_context_clear(iommu, info->dev);
5060                 intel_pasid_free_table(info->dev);
5061         }
5062
5063         unlink_domain_info(info);
5064
5065         spin_lock_irqsave(&iommu->lock, flags);
5066         domain_detach_iommu(domain, iommu);
5067         spin_unlock_irqrestore(&iommu->lock, flags);
5068
5069         free_devinfo_mem(info);
5070 }
5071
5072 static void dmar_remove_one_dev_info(struct device *dev)
5073 {
5074         struct device_domain_info *info;
5075         unsigned long flags;
5076
5077         spin_lock_irqsave(&device_domain_lock, flags);
5078         info = get_domain_info(dev);
5079         if (info)
5080                 __dmar_remove_one_dev_info(info);
5081         spin_unlock_irqrestore(&device_domain_lock, flags);
5082 }
5083
5084 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5085 {
5086         int adjust_width;
5087
5088         /* calculate AGAW */
5089         domain->gaw = guest_width;
5090         adjust_width = guestwidth_to_adjustwidth(guest_width);
5091         domain->agaw = width_to_agaw(adjust_width);
5092
5093         domain->iommu_coherency = 0;
5094         domain->iommu_snooping = 0;
5095         domain->iommu_superpage = 0;
5096         domain->max_addr = 0;
5097
5098         /* always allocate the top pgd */
5099         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5100         if (!domain->pgd)
5101                 return -ENOMEM;
5102         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5103         return 0;
5104 }
5105
5106 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5107 {
5108         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5109         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5110
5111         if (!intel_iommu_strict &&
5112             init_iova_flush_queue(&dmar_domain->iovad,
5113                                   iommu_flush_iova, iova_entry_free))
5114                 pr_info("iova flush queue initialization failed\n");
5115 }
5116
5117 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5118 {
5119         struct dmar_domain *dmar_domain;
5120         struct iommu_domain *domain;
5121
5122         switch (type) {
5123         case IOMMU_DOMAIN_DMA:
5124         case IOMMU_DOMAIN_UNMANAGED:
5125                 dmar_domain = alloc_domain(0);
5126                 if (!dmar_domain) {
5127                         pr_err("Can't allocate dmar_domain\n");
5128                         return NULL;
5129                 }
5130                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5131                         pr_err("Domain initialization failed\n");
5132                         domain_exit(dmar_domain);
5133                         return NULL;
5134                 }
5135
5136                 if (type == IOMMU_DOMAIN_DMA)
5137                         intel_init_iova_domain(dmar_domain);
5138
5139                 domain = &dmar_domain->domain;
5140                 domain->geometry.aperture_start = 0;
5141                 domain->geometry.aperture_end   =
5142                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5143                 domain->geometry.force_aperture = true;
5144
5145                 return domain;
5146         case IOMMU_DOMAIN_IDENTITY:
5147                 return &si_domain->domain;
5148         default:
5149                 return NULL;
5150         }
5151
5152         return NULL;
5153 }
5154
5155 static void intel_iommu_domain_free(struct iommu_domain *domain)
5156 {
5157         if (domain != &si_domain->domain)
5158                 domain_exit(to_dmar_domain(domain));
5159 }
5160
5161 /*
5162  * Check whether a @domain could be attached to the @dev through the
5163  * aux-domain attach/detach APIs.
5164  */
5165 static inline bool
5166 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5167 {
5168         struct device_domain_info *info = get_domain_info(dev);
5169
5170         return info && info->auxd_enabled &&
5171                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5172 }
5173
5174 static void auxiliary_link_device(struct dmar_domain *domain,
5175                                   struct device *dev)
5176 {
5177         struct device_domain_info *info = get_domain_info(dev);
5178
5179         assert_spin_locked(&device_domain_lock);
5180         if (WARN_ON(!info))
5181                 return;
5182
5183         domain->auxd_refcnt++;
5184         list_add(&domain->auxd, &info->auxiliary_domains);
5185 }
5186
5187 static void auxiliary_unlink_device(struct dmar_domain *domain,
5188                                     struct device *dev)
5189 {
5190         struct device_domain_info *info = get_domain_info(dev);
5191
5192         assert_spin_locked(&device_domain_lock);
5193         if (WARN_ON(!info))
5194                 return;
5195
5196         list_del(&domain->auxd);
5197         domain->auxd_refcnt--;
5198
5199         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5200                 ioasid_free(domain->default_pasid);
5201 }
5202
5203 static int aux_domain_add_dev(struct dmar_domain *domain,
5204                               struct device *dev)
5205 {
5206         int ret;
5207         unsigned long flags;
5208         struct intel_iommu *iommu;
5209
5210         iommu = device_to_iommu(dev, NULL, NULL);
5211         if (!iommu)
5212                 return -ENODEV;
5213
5214         if (domain->default_pasid <= 0) {
5215                 u32 pasid;
5216
5217                 /* No private data needed for the default pasid */
5218                 pasid = ioasid_alloc(NULL, PASID_MIN,
5219                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5220                                      NULL);
5221                 if (pasid == INVALID_IOASID) {
5222                         pr_err("Can't allocate default pasid\n");
5223                         return -ENODEV;
5224                 }
5225                 domain->default_pasid = pasid;
5226         }
5227
5228         spin_lock_irqsave(&device_domain_lock, flags);
5229         /*
5230          * iommu->lock must be held to attach domain to iommu and setup the
5231          * pasid entry for second level translation.
5232          */
5233         spin_lock(&iommu->lock);
5234         ret = domain_attach_iommu(domain, iommu);
5235         if (ret)
5236                 goto attach_failed;
5237
5238         /* Setup the PASID entry for mediated devices: */
5239         if (domain_use_first_level(domain))
5240                 ret = domain_setup_first_level(iommu, domain, dev,
5241                                                domain->default_pasid);
5242         else
5243                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5244                                                      domain->default_pasid);
5245         if (ret)
5246                 goto table_failed;
5247         spin_unlock(&iommu->lock);
5248
5249         auxiliary_link_device(domain, dev);
5250
5251         spin_unlock_irqrestore(&device_domain_lock, flags);
5252
5253         return 0;
5254
5255 table_failed:
5256         domain_detach_iommu(domain, iommu);
5257 attach_failed:
5258         spin_unlock(&iommu->lock);
5259         spin_unlock_irqrestore(&device_domain_lock, flags);
5260         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5261                 ioasid_free(domain->default_pasid);
5262
5263         return ret;
5264 }
5265
5266 static void aux_domain_remove_dev(struct dmar_domain *domain,
5267                                   struct device *dev)
5268 {
5269         struct device_domain_info *info;
5270         struct intel_iommu *iommu;
5271         unsigned long flags;
5272
5273         if (!is_aux_domain(dev, &domain->domain))
5274                 return;
5275
5276         spin_lock_irqsave(&device_domain_lock, flags);
5277         info = get_domain_info(dev);
5278         iommu = info->iommu;
5279
5280         auxiliary_unlink_device(domain, dev);
5281
5282         spin_lock(&iommu->lock);
5283         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5284         domain_detach_iommu(domain, iommu);
5285         spin_unlock(&iommu->lock);
5286
5287         spin_unlock_irqrestore(&device_domain_lock, flags);
5288 }
5289
5290 static int prepare_domain_attach_device(struct iommu_domain *domain,
5291                                         struct device *dev)
5292 {
5293         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5294         struct intel_iommu *iommu;
5295         int addr_width;
5296
5297         iommu = device_to_iommu(dev, NULL, NULL);
5298         if (!iommu)
5299                 return -ENODEV;
5300
5301         /* check if this iommu agaw is sufficient for max mapped address */
5302         addr_width = agaw_to_width(iommu->agaw);
5303         if (addr_width > cap_mgaw(iommu->cap))
5304                 addr_width = cap_mgaw(iommu->cap);
5305
5306         if (dmar_domain->max_addr > (1LL << addr_width)) {
5307                 dev_err(dev, "%s: iommu width (%d) is not "
5308                         "sufficient for the mapped address (%llx)\n",
5309                         __func__, addr_width, dmar_domain->max_addr);
5310                 return -EFAULT;
5311         }
5312         dmar_domain->gaw = addr_width;
5313
5314         /*
5315          * Knock out extra levels of page tables if necessary
5316          */
5317         while (iommu->agaw < dmar_domain->agaw) {
5318                 struct dma_pte *pte;
5319
5320                 pte = dmar_domain->pgd;
5321                 if (dma_pte_present(pte)) {
5322                         dmar_domain->pgd = (struct dma_pte *)
5323                                 phys_to_virt(dma_pte_addr(pte));
5324                         free_pgtable_page(pte);
5325                 }
5326                 dmar_domain->agaw--;
5327         }
5328
5329         return 0;
5330 }
5331
5332 static int intel_iommu_attach_device(struct iommu_domain *domain,
5333                                      struct device *dev)
5334 {
5335         int ret;
5336
5337         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5338             device_is_rmrr_locked(dev)) {
5339                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5340                 return -EPERM;
5341         }
5342
5343         if (is_aux_domain(dev, domain))
5344                 return -EPERM;
5345
5346         /* normally dev is not mapped */
5347         if (unlikely(domain_context_mapped(dev))) {
5348                 struct dmar_domain *old_domain;
5349
5350                 old_domain = find_domain(dev);
5351                 if (old_domain)
5352                         dmar_remove_one_dev_info(dev);
5353         }
5354
5355         ret = prepare_domain_attach_device(domain, dev);
5356         if (ret)
5357                 return ret;
5358
5359         return domain_add_dev_info(to_dmar_domain(domain), dev);
5360 }
5361
5362 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5363                                          struct device *dev)
5364 {
5365         int ret;
5366
5367         if (!is_aux_domain(dev, domain))
5368                 return -EPERM;
5369
5370         ret = prepare_domain_attach_device(domain, dev);
5371         if (ret)
5372                 return ret;
5373
5374         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5375 }
5376
5377 static void intel_iommu_detach_device(struct iommu_domain *domain,
5378                                       struct device *dev)
5379 {
5380         dmar_remove_one_dev_info(dev);
5381 }
5382
5383 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5384                                           struct device *dev)
5385 {
5386         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5387 }
5388
5389 #ifdef CONFIG_INTEL_IOMMU_SVM
5390 /*
5391  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5392  * VT-d granularity. Invalidation is typically included in the unmap operation
5393  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5394  * owns the first level page tables. Invalidations of translation caches in the
5395  * guest are trapped and passed down to the host.
5396  *
5397  * vIOMMU in the guest will only expose first level page tables, therefore
5398  * we do not support IOTLB granularity for request without PASID (second level).
5399  *
5400  * For example, to find the VT-d granularity encoding for IOTLB
5401  * type and page selective granularity within PASID:
5402  * X: indexed by iommu cache type
5403  * Y: indexed by enum iommu_inv_granularity
5404  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5405  */
5406
5407 static const int
5408 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5409         /*
5410          * PASID based IOTLB invalidation: PASID selective (per PASID),
5411          * page selective (address granularity)
5412          */
5413         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5414         /* PASID based dev TLBs */
5415         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5416         /* PASID cache */
5417         {-EINVAL, -EINVAL, -EINVAL}
5418 };
5419
5420 static inline int to_vtd_granularity(int type, int granu)
5421 {
5422         return inv_type_granu_table[type][granu];
5423 }
5424
5425 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5426 {
5427         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5428
5429         /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5430          * IOMMU cache invalidate API passes granu_size in bytes, and number of
5431          * granu size in contiguous memory.
5432          */
5433         return order_base_2(nr_pages);
5434 }
5435
5436 static int
5437 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5438                            struct iommu_cache_invalidate_info *inv_info)
5439 {
5440         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5441         struct device_domain_info *info;
5442         struct intel_iommu *iommu;
5443         unsigned long flags;
5444         int cache_type;
5445         u8 bus, devfn;
5446         u16 did, sid;
5447         int ret = 0;
5448         u64 size = 0;
5449
5450         if (!inv_info || !dmar_domain)
5451                 return -EINVAL;
5452
5453         if (!dev || !dev_is_pci(dev))
5454                 return -ENODEV;
5455
5456         iommu = device_to_iommu(dev, &bus, &devfn);
5457         if (!iommu)
5458                 return -ENODEV;
5459
5460         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5461                 return -EINVAL;
5462
5463         spin_lock_irqsave(&device_domain_lock, flags);
5464         spin_lock(&iommu->lock);
5465         info = get_domain_info(dev);
5466         if (!info) {
5467                 ret = -EINVAL;
5468                 goto out_unlock;
5469         }
5470         did = dmar_domain->iommu_did[iommu->seq_id];
5471         sid = PCI_DEVID(bus, devfn);
5472
5473         /* Size is only valid in address selective invalidation */
5474         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5475                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5476                                    inv_info->granu.addr_info.nb_granules);
5477
5478         for_each_set_bit(cache_type,
5479                          (unsigned long *)&inv_info->cache,
5480                          IOMMU_CACHE_INV_TYPE_NR) {
5481                 int granu = 0;
5482                 u64 pasid = 0;
5483                 u64 addr = 0;
5484
5485                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5486                 if (granu == -EINVAL) {
5487                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5488                                            cache_type, inv_info->granularity);
5489                         break;
5490                 }
5491
5492                 /*
5493                  * PASID is stored in different locations based on the
5494                  * granularity.
5495                  */
5496                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5497                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5498                         pasid = inv_info->granu.pasid_info.pasid;
5499                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5500                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5501                         pasid = inv_info->granu.addr_info.pasid;
5502
5503                 switch (BIT(cache_type)) {
5504                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5505                         /* HW will ignore LSB bits based on address mask */
5506                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5507                             size &&
5508                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5509                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5510                                                    inv_info->granu.addr_info.addr, size);
5511                         }
5512
5513                         /*
5514                          * If granu is PASID-selective, address is ignored.
5515                          * We use npages = -1 to indicate that.
5516                          */
5517                         qi_flush_piotlb(iommu, did, pasid,
5518                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5519                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5520                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5521
5522                         if (!info->ats_enabled)
5523                                 break;
5524                         /*
5525                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5526                          * in the guest may assume IOTLB flush is inclusive,
5527                          * which is more efficient.
5528                          */
5529                         fallthrough;
5530                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5531                         /*
5532                          * PASID based device TLB invalidation does not support
5533                          * IOMMU_INV_GRANU_PASID granularity but only supports
5534                          * IOMMU_INV_GRANU_ADDR.
5535                          * The equivalent of that is we set the size to be the
5536                          * entire range of 64 bit. User only provides PASID info
5537                          * without address info. So we set addr to 0.
5538                          */
5539                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5540                                 size = 64 - VTD_PAGE_SHIFT;
5541                                 addr = 0;
5542                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5543                                 addr = inv_info->granu.addr_info.addr;
5544                         }
5545
5546                         if (info->ats_enabled)
5547                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5548                                                 info->pfsid, pasid,
5549                                                 info->ats_qdep, addr,
5550                                                 size);
5551                         else
5552                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5553                         break;
5554                 default:
5555                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5556                                             cache_type);
5557                         ret = -EINVAL;
5558                 }
5559         }
5560 out_unlock:
5561         spin_unlock(&iommu->lock);
5562         spin_unlock_irqrestore(&device_domain_lock, flags);
5563
5564         return ret;
5565 }
5566 #endif
5567
5568 static int intel_iommu_map(struct iommu_domain *domain,
5569                            unsigned long iova, phys_addr_t hpa,
5570                            size_t size, int iommu_prot, gfp_t gfp)
5571 {
5572         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573         u64 max_addr;
5574         int prot = 0;
5575         int ret;
5576
5577         if (iommu_prot & IOMMU_READ)
5578                 prot |= DMA_PTE_READ;
5579         if (iommu_prot & IOMMU_WRITE)
5580                 prot |= DMA_PTE_WRITE;
5581         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5582                 prot |= DMA_PTE_SNP;
5583
5584         max_addr = iova + size;
5585         if (dmar_domain->max_addr < max_addr) {
5586                 u64 end;
5587
5588                 /* check if minimum agaw is sufficient for mapped address */
5589                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5590                 if (end < max_addr) {
5591                         pr_err("%s: iommu width (%d) is not "
5592                                "sufficient for the mapped address (%llx)\n",
5593                                __func__, dmar_domain->gaw, max_addr);
5594                         return -EFAULT;
5595                 }
5596                 dmar_domain->max_addr = max_addr;
5597         }
5598         /* Round up size to next multiple of PAGE_SIZE, if it and
5599            the low bits of hpa would take us onto the next page */
5600         size = aligned_nrpages(hpa, size);
5601         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5602                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5603         return ret;
5604 }
5605
5606 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5607                                 unsigned long iova, size_t size,
5608                                 struct iommu_iotlb_gather *gather)
5609 {
5610         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5611         struct page *freelist = NULL;
5612         unsigned long start_pfn, last_pfn;
5613         unsigned int npages;
5614         int iommu_id, level = 0;
5615
5616         /* Cope with horrid API which requires us to unmap more than the
5617            size argument if it happens to be a large-page mapping. */
5618         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5619
5620         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5621                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5622
5623         start_pfn = iova >> VTD_PAGE_SHIFT;
5624         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5625
5626         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5627
5628         npages = last_pfn - start_pfn + 1;
5629
5630         for_each_domain_iommu(iommu_id, dmar_domain)
5631                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5632                                       start_pfn, npages, !freelist, 0);
5633
5634         dma_free_pagelist(freelist);
5635
5636         if (dmar_domain->max_addr == iova + size)
5637                 dmar_domain->max_addr = iova;
5638
5639         return size;
5640 }
5641
5642 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5643                                             dma_addr_t iova)
5644 {
5645         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5646         struct dma_pte *pte;
5647         int level = 0;
5648         u64 phys = 0;
5649
5650         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5651         if (pte && dma_pte_present(pte))
5652                 phys = dma_pte_addr(pte) +
5653                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5654                                                 VTD_PAGE_SHIFT) - 1));
5655
5656         return phys;
5657 }
5658
5659 static inline bool scalable_mode_support(void)
5660 {
5661         struct dmar_drhd_unit *drhd;
5662         struct intel_iommu *iommu;
5663         bool ret = true;
5664
5665         rcu_read_lock();
5666         for_each_active_iommu(iommu, drhd) {
5667                 if (!sm_supported(iommu)) {
5668                         ret = false;
5669                         break;
5670                 }
5671         }
5672         rcu_read_unlock();
5673
5674         return ret;
5675 }
5676
5677 static inline bool iommu_pasid_support(void)
5678 {
5679         struct dmar_drhd_unit *drhd;
5680         struct intel_iommu *iommu;
5681         bool ret = true;
5682
5683         rcu_read_lock();
5684         for_each_active_iommu(iommu, drhd) {
5685                 if (!pasid_supported(iommu)) {
5686                         ret = false;
5687                         break;
5688                 }
5689         }
5690         rcu_read_unlock();
5691
5692         return ret;
5693 }
5694
5695 static inline bool nested_mode_support(void)
5696 {
5697         struct dmar_drhd_unit *drhd;
5698         struct intel_iommu *iommu;
5699         bool ret = true;
5700
5701         rcu_read_lock();
5702         for_each_active_iommu(iommu, drhd) {
5703                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5704                         ret = false;
5705                         break;
5706                 }
5707         }
5708         rcu_read_unlock();
5709
5710         return ret;
5711 }
5712
5713 static bool intel_iommu_capable(enum iommu_cap cap)
5714 {
5715         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5716                 return domain_update_iommu_snooping(NULL) == 1;
5717         if (cap == IOMMU_CAP_INTR_REMAP)
5718                 return irq_remapping_enabled == 1;
5719
5720         return false;
5721 }
5722
5723 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5724 {
5725         struct intel_iommu *iommu;
5726
5727         iommu = device_to_iommu(dev, NULL, NULL);
5728         if (!iommu)
5729                 return ERR_PTR(-ENODEV);
5730
5731         if (translation_pre_enabled(iommu))
5732                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5733
5734         return &iommu->iommu;
5735 }
5736
5737 static void intel_iommu_release_device(struct device *dev)
5738 {
5739         struct intel_iommu *iommu;
5740
5741         iommu = device_to_iommu(dev, NULL, NULL);
5742         if (!iommu)
5743                 return;
5744
5745         dmar_remove_one_dev_info(dev);
5746
5747         set_dma_ops(dev, NULL);
5748 }
5749
5750 static void intel_iommu_probe_finalize(struct device *dev)
5751 {
5752         struct iommu_domain *domain;
5753
5754         domain = iommu_get_domain_for_dev(dev);
5755         if (device_needs_bounce(dev))
5756                 set_dma_ops(dev, &bounce_dma_ops);
5757         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5758                 set_dma_ops(dev, &intel_dma_ops);
5759         else
5760                 set_dma_ops(dev, NULL);
5761 }
5762
5763 static void intel_iommu_get_resv_regions(struct device *device,
5764                                          struct list_head *head)
5765 {
5766         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5767         struct iommu_resv_region *reg;
5768         struct dmar_rmrr_unit *rmrr;
5769         struct device *i_dev;
5770         int i;
5771
5772         down_read(&dmar_global_lock);
5773         for_each_rmrr_units(rmrr) {
5774                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5775                                           i, i_dev) {
5776                         struct iommu_resv_region *resv;
5777                         enum iommu_resv_type type;
5778                         size_t length;
5779
5780                         if (i_dev != device &&
5781                             !is_downstream_to_pci_bridge(device, i_dev))
5782                                 continue;
5783
5784                         length = rmrr->end_address - rmrr->base_address + 1;
5785
5786                         type = device_rmrr_is_relaxable(device) ?
5787                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5788
5789                         resv = iommu_alloc_resv_region(rmrr->base_address,
5790                                                        length, prot, type);
5791                         if (!resv)
5792                                 break;
5793
5794                         list_add_tail(&resv->list, head);
5795                 }
5796         }
5797         up_read(&dmar_global_lock);
5798
5799 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5800         if (dev_is_pci(device)) {
5801                 struct pci_dev *pdev = to_pci_dev(device);
5802
5803                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5804                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5805                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5806                         if (reg)
5807                                 list_add_tail(&reg->list, head);
5808                 }
5809         }
5810 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5811
5812         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5813                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5814                                       0, IOMMU_RESV_MSI);
5815         if (!reg)
5816                 return;
5817         list_add_tail(&reg->list, head);
5818 }
5819
5820 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5821 {
5822         struct device_domain_info *info;
5823         struct context_entry *context;
5824         struct dmar_domain *domain;
5825         unsigned long flags;
5826         u64 ctx_lo;
5827         int ret;
5828
5829         domain = find_domain(dev);
5830         if (!domain)
5831                 return -EINVAL;
5832
5833         spin_lock_irqsave(&device_domain_lock, flags);
5834         spin_lock(&iommu->lock);
5835
5836         ret = -EINVAL;
5837         info = get_domain_info(dev);
5838         if (!info || !info->pasid_supported)
5839                 goto out;
5840
5841         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5842         if (WARN_ON(!context))
5843                 goto out;
5844
5845         ctx_lo = context[0].lo;
5846
5847         if (!(ctx_lo & CONTEXT_PASIDE)) {
5848                 ctx_lo |= CONTEXT_PASIDE;
5849                 context[0].lo = ctx_lo;
5850                 wmb();
5851                 iommu->flush.flush_context(iommu,
5852                                            domain->iommu_did[iommu->seq_id],
5853                                            PCI_DEVID(info->bus, info->devfn),
5854                                            DMA_CCMD_MASK_NOBIT,
5855                                            DMA_CCMD_DEVICE_INVL);
5856         }
5857
5858         /* Enable PASID support in the device, if it wasn't already */
5859         if (!info->pasid_enabled)
5860                 iommu_enable_dev_iotlb(info);
5861
5862         ret = 0;
5863
5864  out:
5865         spin_unlock(&iommu->lock);
5866         spin_unlock_irqrestore(&device_domain_lock, flags);
5867
5868         return ret;
5869 }
5870
5871 static void intel_iommu_apply_resv_region(struct device *dev,
5872                                           struct iommu_domain *domain,
5873                                           struct iommu_resv_region *region)
5874 {
5875         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5876         unsigned long start, end;
5877
5878         start = IOVA_PFN(region->start);
5879         end   = IOVA_PFN(region->start + region->length - 1);
5880
5881         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5882 }
5883
5884 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5885 {
5886         if (dev_is_pci(dev))
5887                 return pci_device_group(dev);
5888         return generic_device_group(dev);
5889 }
5890
5891 static int intel_iommu_enable_auxd(struct device *dev)
5892 {
5893         struct device_domain_info *info;
5894         struct intel_iommu *iommu;
5895         unsigned long flags;
5896         int ret;
5897
5898         iommu = device_to_iommu(dev, NULL, NULL);
5899         if (!iommu || dmar_disabled)
5900                 return -EINVAL;
5901
5902         if (!sm_supported(iommu) || !pasid_supported(iommu))
5903                 return -EINVAL;
5904
5905         ret = intel_iommu_enable_pasid(iommu, dev);
5906         if (ret)
5907                 return -ENODEV;
5908
5909         spin_lock_irqsave(&device_domain_lock, flags);
5910         info = get_domain_info(dev);
5911         info->auxd_enabled = 1;
5912         spin_unlock_irqrestore(&device_domain_lock, flags);
5913
5914         return 0;
5915 }
5916
5917 static int intel_iommu_disable_auxd(struct device *dev)
5918 {
5919         struct device_domain_info *info;
5920         unsigned long flags;
5921
5922         spin_lock_irqsave(&device_domain_lock, flags);
5923         info = get_domain_info(dev);
5924         if (!WARN_ON(!info))
5925                 info->auxd_enabled = 0;
5926         spin_unlock_irqrestore(&device_domain_lock, flags);
5927
5928         return 0;
5929 }
5930
5931 /*
5932  * A PCI express designated vendor specific extended capability is defined
5933  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5934  * for system software and tools to detect endpoint devices supporting the
5935  * Intel scalable IO virtualization without host driver dependency.
5936  *
5937  * Returns the address of the matching extended capability structure within
5938  * the device's PCI configuration space or 0 if the device does not support
5939  * it.
5940  */
5941 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5942 {
5943         int pos;
5944         u16 vendor, id;
5945
5946         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5947         while (pos) {
5948                 pci_read_config_word(pdev, pos + 4, &vendor);
5949                 pci_read_config_word(pdev, pos + 8, &id);
5950                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5951                         return pos;
5952
5953                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5954         }
5955
5956         return 0;
5957 }
5958
5959 static bool
5960 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5961 {
5962         if (feat == IOMMU_DEV_FEAT_AUX) {
5963                 int ret;
5964
5965                 if (!dev_is_pci(dev) || dmar_disabled ||
5966                     !scalable_mode_support() || !iommu_pasid_support())
5967                         return false;
5968
5969                 ret = pci_pasid_features(to_pci_dev(dev));
5970                 if (ret < 0)
5971                         return false;
5972
5973                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5974         }
5975
5976         if (feat == IOMMU_DEV_FEAT_SVA) {
5977                 struct device_domain_info *info = get_domain_info(dev);
5978
5979                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5980                         info->pasid_supported && info->pri_supported &&
5981                         info->ats_supported;
5982         }
5983
5984         return false;
5985 }
5986
5987 static int
5988 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5989 {
5990         if (feat == IOMMU_DEV_FEAT_AUX)
5991                 return intel_iommu_enable_auxd(dev);
5992
5993         if (feat == IOMMU_DEV_FEAT_SVA) {
5994                 struct device_domain_info *info = get_domain_info(dev);
5995
5996                 if (!info)
5997                         return -EINVAL;
5998
5999                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6000                         return 0;
6001         }
6002
6003         return -ENODEV;
6004 }
6005
6006 static int
6007 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6008 {
6009         if (feat == IOMMU_DEV_FEAT_AUX)
6010                 return intel_iommu_disable_auxd(dev);
6011
6012         return -ENODEV;
6013 }
6014
6015 static bool
6016 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6017 {
6018         struct device_domain_info *info = get_domain_info(dev);
6019
6020         if (feat == IOMMU_DEV_FEAT_AUX)
6021                 return scalable_mode_support() && info && info->auxd_enabled;
6022
6023         return false;
6024 }
6025
6026 static int
6027 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6028 {
6029         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6030
6031         return dmar_domain->default_pasid > 0 ?
6032                         dmar_domain->default_pasid : -EINVAL;
6033 }
6034
6035 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6036                                            struct device *dev)
6037 {
6038         return attach_deferred(dev);
6039 }
6040
6041 static int
6042 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6043                             enum iommu_attr attr, void *data)
6044 {
6045         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6046         unsigned long flags;
6047         int ret = 0;
6048
6049         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6050                 return -EINVAL;
6051
6052         switch (attr) {
6053         case DOMAIN_ATTR_NESTING:
6054                 spin_lock_irqsave(&device_domain_lock, flags);
6055                 if (nested_mode_support() &&
6056                     list_empty(&dmar_domain->devices)) {
6057                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6058                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6059                 } else {
6060                         ret = -ENODEV;
6061                 }
6062                 spin_unlock_irqrestore(&device_domain_lock, flags);
6063                 break;
6064         default:
6065                 ret = -EINVAL;
6066                 break;
6067         }
6068
6069         return ret;
6070 }
6071
6072 /*
6073  * Check that the device does not live on an external facing PCI port that is
6074  * marked as untrusted. Such devices should not be able to apply quirks and
6075  * thus not be able to bypass the IOMMU restrictions.
6076  */
6077 static bool risky_device(struct pci_dev *pdev)
6078 {
6079         if (pdev->untrusted) {
6080                 pci_info(pdev,
6081                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6082                          pdev->vendor, pdev->device);
6083                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6084                 return true;
6085         }
6086         return false;
6087 }
6088
6089 const struct iommu_ops intel_iommu_ops = {
6090         .capable                = intel_iommu_capable,
6091         .domain_alloc           = intel_iommu_domain_alloc,
6092         .domain_free            = intel_iommu_domain_free,
6093         .domain_set_attr        = intel_iommu_domain_set_attr,
6094         .attach_dev             = intel_iommu_attach_device,
6095         .detach_dev             = intel_iommu_detach_device,
6096         .aux_attach_dev         = intel_iommu_aux_attach_device,
6097         .aux_detach_dev         = intel_iommu_aux_detach_device,
6098         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6099         .map                    = intel_iommu_map,
6100         .unmap                  = intel_iommu_unmap,
6101         .iova_to_phys           = intel_iommu_iova_to_phys,
6102         .probe_device           = intel_iommu_probe_device,
6103         .probe_finalize         = intel_iommu_probe_finalize,
6104         .release_device         = intel_iommu_release_device,
6105         .get_resv_regions       = intel_iommu_get_resv_regions,
6106         .put_resv_regions       = generic_iommu_put_resv_regions,
6107         .apply_resv_region      = intel_iommu_apply_resv_region,
6108         .device_group           = intel_iommu_device_group,
6109         .dev_has_feat           = intel_iommu_dev_has_feat,
6110         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6111         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6112         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6113         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6114         .def_domain_type        = device_def_domain_type,
6115         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6116 #ifdef CONFIG_INTEL_IOMMU_SVM
6117         .cache_invalidate       = intel_iommu_sva_invalidate,
6118         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6119         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6120         .sva_bind               = intel_svm_bind,
6121         .sva_unbind             = intel_svm_unbind,
6122         .sva_get_pasid          = intel_svm_get_pasid,
6123         .page_response          = intel_svm_page_response,
6124 #endif
6125 };
6126
6127 static void quirk_iommu_igfx(struct pci_dev *dev)
6128 {
6129         if (risky_device(dev))
6130                 return;
6131
6132         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6133         dmar_map_gfx = 0;
6134 }
6135
6136 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6144
6145 /* Broadwell igfx malfunctions with dmar */
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6170
6171 static void quirk_iommu_rwbf(struct pci_dev *dev)
6172 {
6173         if (risky_device(dev))
6174                 return;
6175
6176         /*
6177          * Mobile 4 Series Chipset neglects to set RWBF capability,
6178          * but needs it. Same seems to hold for the desktop versions.
6179          */
6180         pci_info(dev, "Forcing write-buffer flush capability\n");
6181         rwbf_quirk = 1;
6182 }
6183
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6187 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6188 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6190 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6191
6192 #define GGC 0x52
6193 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6194 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6195 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6196 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6197 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6198 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6199 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6200 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6201
6202 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6203 {
6204         unsigned short ggc;
6205
6206         if (risky_device(dev))
6207                 return;
6208
6209         if (pci_read_config_word(dev, GGC, &ggc))
6210                 return;
6211
6212         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6213                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6214                 dmar_map_gfx = 0;
6215         } else if (dmar_map_gfx) {
6216                 /* we have to ensure the gfx device is idle before we flush */
6217                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6218                 intel_iommu_strict = 1;
6219        }
6220 }
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6225
6226 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6227 {
6228         unsigned short ver;
6229
6230         if (!IS_GFX_DEVICE(dev))
6231                 return;
6232
6233         ver = (dev->device >> 8) & 0xff;
6234         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6235             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6236             ver != 0x9a)
6237                 return;
6238
6239         if (risky_device(dev))
6240                 return;
6241
6242         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6243         iommu_skip_te_disable = 1;
6244 }
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6246
6247 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6248    ISOCH DMAR unit for the Azalia sound device, but not give it any
6249    TLB entries, which causes it to deadlock. Check for that.  We do
6250    this in a function called from init_dmars(), instead of in a PCI
6251    quirk, because we don't want to print the obnoxious "BIOS broken"
6252    message if VT-d is actually disabled.
6253 */
6254 static void __init check_tylersburg_isoch(void)
6255 {
6256         struct pci_dev *pdev;
6257         uint32_t vtisochctrl;
6258
6259         /* If there's no Azalia in the system anyway, forget it. */
6260         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6261         if (!pdev)
6262                 return;
6263
6264         if (risky_device(pdev)) {
6265                 pci_dev_put(pdev);
6266                 return;
6267         }
6268
6269         pci_dev_put(pdev);
6270
6271         /* System Management Registers. Might be hidden, in which case
6272            we can't do the sanity check. But that's OK, because the
6273            known-broken BIOSes _don't_ actually hide it, so far. */
6274         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6275         if (!pdev)
6276                 return;
6277
6278         if (risky_device(pdev)) {
6279                 pci_dev_put(pdev);
6280                 return;
6281         }
6282
6283         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6284                 pci_dev_put(pdev);
6285                 return;
6286         }
6287
6288         pci_dev_put(pdev);
6289
6290         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6291         if (vtisochctrl & 1)
6292                 return;
6293
6294         /* Drop all bits other than the number of TLB entries */
6295         vtisochctrl &= 0x1c;
6296
6297         /* If we have the recommended number of TLB entries (16), fine. */
6298         if (vtisochctrl == 0x10)
6299                 return;
6300
6301         /* Zero TLB entries? You get to ride the short bus to school. */
6302         if (!vtisochctrl) {
6303                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6304                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6305                      dmi_get_system_info(DMI_BIOS_VENDOR),
6306                      dmi_get_system_info(DMI_BIOS_VERSION),
6307                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6308                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6309                 return;
6310         }
6311
6312         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6313                vtisochctrl);
6314 }