perf annotate: Fix segfault with source toggle
[linux-2.6-microblaze.git] / drivers / iommu / arm-smmu-v3.c
index effe72e..aa3ac2a 100644 (file)
@@ -21,8 +21,7 @@
 #include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/iopoll.h>
-#include <linux/init.h>
-#include <linux/moduleparam.h>
+#include <linux/module.h>
 #include <linux/msi.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 
 #define STRTAB_STE_0_S1FMT             GENMASK_ULL(5, 4)
 #define STRTAB_STE_0_S1FMT_LINEAR      0
+#define STRTAB_STE_0_S1FMT_64K_L2      2
 #define STRTAB_STE_0_S1CTXPTR_MASK     GENMASK_ULL(51, 6)
 #define STRTAB_STE_0_S1CDMAX           GENMASK_ULL(63, 59)
 
+#define STRTAB_STE_1_S1DSS             GENMASK_ULL(1, 0)
+#define STRTAB_STE_1_S1DSS_TERMINATE   0x0
+#define STRTAB_STE_1_S1DSS_BYPASS      0x1
+#define STRTAB_STE_1_S1DSS_SSID0       0x2
+
 #define STRTAB_STE_1_S1C_CACHE_NC      0UL
 #define STRTAB_STE_1_S1C_CACHE_WBRA    1UL
 #define STRTAB_STE_1_S1C_CACHE_WT      2UL
 
 #define STRTAB_STE_2_S2VMID            GENMASK_ULL(15, 0)
 #define STRTAB_STE_2_VTCR              GENMASK_ULL(50, 32)
+#define STRTAB_STE_2_VTCR_S2T0SZ       GENMASK_ULL(5, 0)
+#define STRTAB_STE_2_VTCR_S2SL0                GENMASK_ULL(7, 6)
+#define STRTAB_STE_2_VTCR_S2IR0                GENMASK_ULL(9, 8)
+#define STRTAB_STE_2_VTCR_S2OR0                GENMASK_ULL(11, 10)
+#define STRTAB_STE_2_VTCR_S2SH0                GENMASK_ULL(13, 12)
+#define STRTAB_STE_2_VTCR_S2TG         GENMASK_ULL(15, 14)
+#define STRTAB_STE_2_VTCR_S2PS         GENMASK_ULL(18, 16)
 #define STRTAB_STE_2_S2AA64            (1UL << 51)
 #define STRTAB_STE_2_S2ENDI            (1UL << 52)
 #define STRTAB_STE_2_S2PTW             (1UL << 54)
 
 #define STRTAB_STE_3_S2TTB_MASK                GENMASK_ULL(51, 4)
 
-/* Context descriptor (stage-1 only) */
+/*
+ * Context descriptors.
+ *
+ * Linear: when less than 1024 SSIDs are supported
+ * 2lvl: at most 1024 L1 entries,
+ *       1024 lazy entries per table.
+ */
+#define CTXDESC_SPLIT                  10
+#define CTXDESC_L2_ENTRIES             (1 << CTXDESC_SPLIT)
+
+#define CTXDESC_L1_DESC_DWORDS         1
+#define CTXDESC_L1_DESC_V              (1UL << 0)
+#define CTXDESC_L1_DESC_L2PTR_MASK     GENMASK_ULL(51, 12)
+
 #define CTXDESC_CD_DWORDS              8
 #define CTXDESC_CD_0_TCR_T0SZ          GENMASK_ULL(5, 0)
-#define ARM64_TCR_T0SZ                 GENMASK_ULL(5, 0)
 #define CTXDESC_CD_0_TCR_TG0           GENMASK_ULL(7, 6)
-#define ARM64_TCR_TG0                  GENMASK_ULL(15, 14)
 #define CTXDESC_CD_0_TCR_IRGN0         GENMASK_ULL(9, 8)
-#define ARM64_TCR_IRGN0                        GENMASK_ULL(9, 8)
 #define CTXDESC_CD_0_TCR_ORGN0         GENMASK_ULL(11, 10)
-#define ARM64_TCR_ORGN0                        GENMASK_ULL(11, 10)
 #define CTXDESC_CD_0_TCR_SH0           GENMASK_ULL(13, 12)
-#define ARM64_TCR_SH0                  GENMASK_ULL(13, 12)
 #define CTXDESC_CD_0_TCR_EPD0          (1ULL << 14)
-#define ARM64_TCR_EPD0                 (1ULL << 7)
 #define CTXDESC_CD_0_TCR_EPD1          (1ULL << 30)
-#define ARM64_TCR_EPD1                 (1ULL << 23)
 
 #define CTXDESC_CD_0_ENDI              (1UL << 15)
 #define CTXDESC_CD_0_V                 (1UL << 31)
 
 #define CTXDESC_CD_0_TCR_IPS           GENMASK_ULL(34, 32)
-#define ARM64_TCR_IPS                  GENMASK_ULL(34, 32)
 #define CTXDESC_CD_0_TCR_TBI0          (1ULL << 38)
-#define ARM64_TCR_TBI0                 (1ULL << 37)
 
 #define CTXDESC_CD_0_AA64              (1UL << 41)
 #define CTXDESC_CD_0_S                 (1UL << 44)
 
 #define CTXDESC_CD_1_TTB0_MASK         GENMASK_ULL(51, 4)
 
-/* Convert between AArch64 (CPU) TCR format and SMMU CD format */
-#define ARM_SMMU_TCR2CD(tcr, fld)      FIELD_PREP(CTXDESC_CD_0_TCR_##fld, \
-                                       FIELD_GET(ARM64_TCR_##fld, tcr))
+/*
+ * When the SMMU only supports linear context descriptor tables, pick a
+ * reasonable size limit (64kB).
+ */
+#define CTXDESC_LINEAR_CDMAX           ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3))
 
 /* Command queue */
 #define CMDQ_ENT_SZ_SHIFT              4
 #define CMDQ_PREFETCH_1_SIZE           GENMASK_ULL(4, 0)
 #define CMDQ_PREFETCH_1_ADDR_MASK      GENMASK_ULL(63, 12)
 
+#define CMDQ_CFGI_0_SSID               GENMASK_ULL(31, 12)
 #define CMDQ_CFGI_0_SID                        GENMASK_ULL(63, 32)
 #define CMDQ_CFGI_1_LEAF               (1UL << 0)
 #define CMDQ_CFGI_1_RANGE              GENMASK_ULL(4, 0)
 #define MSI_IOVA_BASE                  0x8000000
 #define MSI_IOVA_LENGTH                        0x100000
 
-/*
- * not really modular, but the easiest way to keep compat with existing
- * bootargs behaviour is to continue using module_param_named here.
- */
 static bool disable_bypass = 1;
 module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO);
 MODULE_PARM_DESC(disable_bypass,
@@ -440,8 +455,11 @@ struct arm_smmu_cmdq_ent {
 
                #define CMDQ_OP_CFGI_STE        0x3
                #define CMDQ_OP_CFGI_ALL        0x4
+               #define CMDQ_OP_CFGI_CD         0x5
+               #define CMDQ_OP_CFGI_CD_ALL     0x6
                struct {
                        u32                     sid;
+                       u32                     ssid;
                        union {
                                bool            leaf;
                                u8              span;
@@ -547,16 +565,30 @@ struct arm_smmu_strtab_l1_desc {
        dma_addr_t                      l2ptr_dma;
 };
 
+struct arm_smmu_ctx_desc {
+       u16                             asid;
+       u64                             ttbr;
+       u64                             tcr;
+       u64                             mair;
+};
+
+struct arm_smmu_l1_ctx_desc {
+       __le64                          *l2ptr;
+       dma_addr_t                      l2ptr_dma;
+};
+
+struct arm_smmu_ctx_desc_cfg {
+       __le64                          *cdtab;
+       dma_addr_t                      cdtab_dma;
+       struct arm_smmu_l1_ctx_desc     *l1_desc;
+       unsigned int                    num_l1_ents;
+};
+
 struct arm_smmu_s1_cfg {
-       __le64                          *cdptr;
-       dma_addr_t                      cdptr_dma;
-
-       struct arm_smmu_ctx_desc {
-               u16     asid;
-               u64     ttbr;
-               u64     tcr;
-               u64     mair;
-       }                               cd;
+       struct arm_smmu_ctx_desc_cfg    cdcfg;
+       struct arm_smmu_ctx_desc        cd;
+       u8                              s1fmt;
+       u8                              s1cdmax;
 };
 
 struct arm_smmu_s2_cfg {
@@ -638,6 +670,7 @@ struct arm_smmu_master {
        u32                             *sids;
        unsigned int                    num_sids;
        bool                            ats_enabled;
+       unsigned int                    ssid_bits;
 };
 
 /* SMMU private data for an IOMMU domain */
@@ -847,15 +880,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
                cmd[1] |= FIELD_PREP(CMDQ_PREFETCH_1_SIZE, ent->prefetch.size);
                cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK;
                break;
+       case CMDQ_OP_CFGI_CD:
+               cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SSID, ent->cfgi.ssid);
+               /* Fallthrough */
        case CMDQ_OP_CFGI_STE:
                cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid);
                cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_LEAF, ent->cfgi.leaf);
                break;
+       case CMDQ_OP_CFGI_CD_ALL:
+               cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid);
+               break;
        case CMDQ_OP_CFGI_ALL:
                /* Cover the entire SID range */
                cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
                break;
        case CMDQ_OP_TLBI_NH_VA:
+               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
                cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
                cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
                cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
@@ -1443,50 +1483,238 @@ static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 }
 
 /* Context descriptor manipulation functions */
-static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
+static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
+                            int ssid, bool leaf)
 {
-       u64 val = 0;
+       size_t i;
+       unsigned long flags;
+       struct arm_smmu_master *master;
+       struct arm_smmu_device *smmu = smmu_domain->smmu;
+       struct arm_smmu_cmdq_ent cmd = {
+               .opcode = CMDQ_OP_CFGI_CD,
+               .cfgi   = {
+                       .ssid   = ssid,
+                       .leaf   = leaf,
+               },
+       };
+
+       spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+       list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+               for (i = 0; i < master->num_sids; i++) {
+                       cmd.cfgi.sid = master->sids[i];
+                       arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+               }
+       }
+       spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+       arm_smmu_cmdq_issue_sync(smmu);
+}
 
-       /* Repack the TCR. Just care about TTBR0 for now */
-       val |= ARM_SMMU_TCR2CD(tcr, T0SZ);
-       val |= ARM_SMMU_TCR2CD(tcr, TG0);
-       val |= ARM_SMMU_TCR2CD(tcr, IRGN0);
-       val |= ARM_SMMU_TCR2CD(tcr, ORGN0);
-       val |= ARM_SMMU_TCR2CD(tcr, SH0);
-       val |= ARM_SMMU_TCR2CD(tcr, EPD0);
-       val |= ARM_SMMU_TCR2CD(tcr, EPD1);
-       val |= ARM_SMMU_TCR2CD(tcr, IPS);
+static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
+                                       struct arm_smmu_l1_ctx_desc *l1_desc)
+{
+       size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
 
-       return val;
+       l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size,
+                                            &l1_desc->l2ptr_dma, GFP_KERNEL);
+       if (!l1_desc->l2ptr) {
+               dev_warn(smmu->dev,
+                        "failed to allocate context descriptor table\n");
+               return -ENOMEM;
+       }
+       return 0;
 }
 
-static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
-                                   struct arm_smmu_s1_cfg *cfg)
+static void arm_smmu_write_cd_l1_desc(__le64 *dst,
+                                     struct arm_smmu_l1_ctx_desc *l1_desc)
 {
-       u64 val;
+       u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
+                 CTXDESC_L1_DESC_V;
+
+       WRITE_ONCE(*dst, cpu_to_le64(val));
+}
+
+static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain,
+                                  u32 ssid)
+{
+       __le64 *l1ptr;
+       unsigned int idx;
+       struct arm_smmu_l1_ctx_desc *l1_desc;
+       struct arm_smmu_device *smmu = smmu_domain->smmu;
+       struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg;
 
+       if (smmu_domain->s1_cfg.s1fmt == STRTAB_STE_0_S1FMT_LINEAR)
+               return cdcfg->cdtab + ssid * CTXDESC_CD_DWORDS;
+
+       idx = ssid >> CTXDESC_SPLIT;
+       l1_desc = &cdcfg->l1_desc[idx];
+       if (!l1_desc->l2ptr) {
+               if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
+                       return NULL;
+
+               l1ptr = cdcfg->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
+               arm_smmu_write_cd_l1_desc(l1ptr, l1_desc);
+               /* An invalid L1CD can be cached */
+               arm_smmu_sync_cd(smmu_domain, ssid, false);
+       }
+       idx = ssid & (CTXDESC_L2_ENTRIES - 1);
+       return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
+}
+
+static int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
+                                  int ssid, struct arm_smmu_ctx_desc *cd)
+{
        /*
-        * We don't need to issue any invalidation here, as we'll invalidate
-        * the STE when installing the new entry anyway.
+        * This function handles the following cases:
+        *
+        * (1) Install primary CD, for normal DMA traffic (SSID = 0).
+        * (2) Install a secondary CD, for SID+SSID traffic.
+        * (3) Update ASID of a CD. Atomically write the first 64 bits of the
+        *     CD, then invalidate the old entry and mappings.
+        * (4) Remove a secondary CD.
         */
-       val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) |
+       u64 val;
+       bool cd_live;
+       __le64 *cdptr;
+       struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+       if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax)))
+               return -E2BIG;
+
+       cdptr = arm_smmu_get_cd_ptr(smmu_domain, ssid);
+       if (!cdptr)
+               return -ENOMEM;
+
+       val = le64_to_cpu(cdptr[0]);
+       cd_live = !!(val & CTXDESC_CD_0_V);
+
+       if (!cd) { /* (4) */
+               val = 0;
+       } else if (cd_live) { /* (3) */
+               val &= ~CTXDESC_CD_0_ASID;
+               val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
+               /*
+                * Until CD+TLB invalidation, both ASIDs may be used for tagging
+                * this substream's traffic
+                */
+       } else { /* (1) and (2) */
+               cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK);
+               cdptr[2] = 0;
+               cdptr[3] = cpu_to_le64(cd->mair);
+
+               /*
+                * STE is live, and the SMMU might read dwords of this CD in any
+                * order. Ensure that it observes valid values before reading
+                * V=1.
+                */
+               arm_smmu_sync_cd(smmu_domain, ssid, true);
+
+               val = cd->tcr |
 #ifdef __BIG_ENDIAN
-             CTXDESC_CD_0_ENDI |
+                       CTXDESC_CD_0_ENDI |
 #endif
-             CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET |
-             CTXDESC_CD_0_AA64 | FIELD_PREP(CTXDESC_CD_0_ASID, cfg->cd.asid) |
-             CTXDESC_CD_0_V;
+                       CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET |
+                       CTXDESC_CD_0_AA64 |
+                       FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) |
+                       CTXDESC_CD_0_V;
 
-       /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */
-       if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
-               val |= CTXDESC_CD_0_S;
+               /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */
+               if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
+                       val |= CTXDESC_CD_0_S;
+       }
 
-       cfg->cdptr[0] = cpu_to_le64(val);
+       /*
+        * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3
+        * "Configuration structures and configuration invalidation completion"
+        *
+        *   The size of single-copy atomic reads made by the SMMU is
+        *   IMPLEMENTATION DEFINED but must be at least 64 bits. Any single
+        *   field within an aligned 64-bit span of a structure can be altered
+        *   without first making the structure invalid.
+        */
+       WRITE_ONCE(cdptr[0], cpu_to_le64(val));
+       arm_smmu_sync_cd(smmu_domain, ssid, true);
+       return 0;
+}
+
+static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain)
+{
+       int ret;
+       size_t l1size;
+       size_t max_contexts;
+       struct arm_smmu_device *smmu = smmu_domain->smmu;
+       struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+       struct arm_smmu_ctx_desc_cfg *cdcfg = &cfg->cdcfg;
+
+       max_contexts = 1 << cfg->s1cdmax;
+
+       if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) ||
+           max_contexts <= CTXDESC_L2_ENTRIES) {
+               cfg->s1fmt = STRTAB_STE_0_S1FMT_LINEAR;
+               cdcfg->num_l1_ents = max_contexts;
 
-       val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK;
-       cfg->cdptr[1] = cpu_to_le64(val);
+               l1size = max_contexts * (CTXDESC_CD_DWORDS << 3);
+       } else {
+               cfg->s1fmt = STRTAB_STE_0_S1FMT_64K_L2;
+               cdcfg->num_l1_ents = DIV_ROUND_UP(max_contexts,
+                                                 CTXDESC_L2_ENTRIES);
+
+               cdcfg->l1_desc = devm_kcalloc(smmu->dev, cdcfg->num_l1_ents,
+                                             sizeof(*cdcfg->l1_desc),
+                                             GFP_KERNEL);
+               if (!cdcfg->l1_desc)
+                       return -ENOMEM;
+
+               l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3);
+       }
+
+       cdcfg->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cdcfg->cdtab_dma,
+                                          GFP_KERNEL);
+       if (!cdcfg->cdtab) {
+               dev_warn(smmu->dev, "failed to allocate context descriptor\n");
+               ret = -ENOMEM;
+               goto err_free_l1;
+       }
+
+       return 0;
 
-       cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair);
+err_free_l1:
+       if (cdcfg->l1_desc) {
+               devm_kfree(smmu->dev, cdcfg->l1_desc);
+               cdcfg->l1_desc = NULL;
+       }
+       return ret;
+}
+
+static void arm_smmu_free_cd_tables(struct arm_smmu_domain *smmu_domain)
+{
+       int i;
+       size_t size, l1size;
+       struct arm_smmu_device *smmu = smmu_domain->smmu;
+       struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg;
+
+       if (cdcfg->l1_desc) {
+               size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
+
+               for (i = 0; i < cdcfg->num_l1_ents; i++) {
+                       if (!cdcfg->l1_desc[i].l2ptr)
+                               continue;
+
+                       dmam_free_coherent(smmu->dev, size,
+                                          cdcfg->l1_desc[i].l2ptr,
+                                          cdcfg->l1_desc[i].l2ptr_dma);
+               }
+               devm_kfree(smmu->dev, cdcfg->l1_desc);
+               cdcfg->l1_desc = NULL;
+
+               l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3);
+       } else {
+               l1size = cdcfg->num_l1_ents * (CTXDESC_CD_DWORDS << 3);
+       }
+
+       dmam_free_coherent(smmu->dev, l1size, cdcfg->cdtab, cdcfg->cdtab_dma);
+       cdcfg->cdtab_dma = 0;
+       cdcfg->cdtab = NULL;
 }
 
 /* Stream table manipulation functions */
@@ -1608,6 +1836,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
        if (s1_cfg) {
                BUG_ON(ste_live);
                dst[1] = cpu_to_le64(
+                        FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
                         FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
                         FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
                         FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) |
@@ -1617,8 +1846,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
                   !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE))
                        dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
 
-               val |= (s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
-                       FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS);
+               val |= (s1_cfg->cdcfg.cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
+                       FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) |
+                       FIELD_PREP(STRTAB_STE_0_S1CDMAX, s1_cfg->s1cdmax) |
+                       FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt);
        }
 
        if (s2_cfg) {
@@ -1642,7 +1873,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
                                                 STRTAB_STE_1_EATS_TRANS));
 
        arm_smmu_sync_ste_for_sid(smmu, sid);
-       dst[0] = cpu_to_le64(val);
+       /* See comment in arm_smmu_write_ctx_desc() */
+       WRITE_ONCE(dst[0], cpu_to_le64(val));
        arm_smmu_sync_ste_for_sid(smmu, sid);
 
        /* It's likely that we'll want to use the new STE soon */
@@ -1675,7 +1907,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
 
        desc->span = STRTAB_SPLIT + 1;
        desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
-                                         GFP_KERNEL | __GFP_ZERO);
+                                         GFP_KERNEL);
        if (!desc->l2ptr) {
                dev_err(smmu->dev,
                        "failed to allocate l2 stream table for SID %u\n",
@@ -2131,12 +2363,8 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
        if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
                struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
 
-               if (cfg->cdptr) {
-                       dmam_free_coherent(smmu_domain->smmu->dev,
-                                          CTXDESC_CD_DWORDS << 3,
-                                          cfg->cdptr,
-                                          cfg->cdptr_dma);
-
+               if (cfg->cdcfg.cdtab) {
+                       arm_smmu_free_cd_tables(smmu_domain);
                        arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid);
                }
        } else {
@@ -2149,55 +2377,82 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
 }
 
 static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
+                                      struct arm_smmu_master *master,
                                       struct io_pgtable_cfg *pgtbl_cfg)
 {
        int ret;
        int asid;
        struct arm_smmu_device *smmu = smmu_domain->smmu;
        struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+       typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr;
 
        asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
        if (asid < 0)
                return asid;
 
-       cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
-                                        &cfg->cdptr_dma,
-                                        GFP_KERNEL | __GFP_ZERO);
-       if (!cfg->cdptr) {
-               dev_warn(smmu->dev, "failed to allocate context descriptor\n");
-               ret = -ENOMEM;
+       cfg->s1cdmax = master->ssid_bits;
+
+       ret = arm_smmu_alloc_cd_tables(smmu_domain);
+       if (ret)
                goto out_free_asid;
-       }
 
        cfg->cd.asid    = (u16)asid;
-       cfg->cd.ttbr    = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
-       cfg->cd.tcr     = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
+       cfg->cd.ttbr    = pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
+       cfg->cd.tcr     = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) |
+                         FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) |
+                         FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) |
+                         FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) |
+                         FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) |
+                         FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) |
+                         CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
        cfg->cd.mair    = pgtbl_cfg->arm_lpae_s1_cfg.mair;
+
+       /*
+        * Note that this will end up calling arm_smmu_sync_cd() before
+        * the master has been added to the devices list for this domain.
+        * This isn't an issue because the STE hasn't been installed yet.
+        */
+       ret = arm_smmu_write_ctx_desc(smmu_domain, 0, &cfg->cd);
+       if (ret)
+               goto out_free_cd_tables;
+
        return 0;
 
+out_free_cd_tables:
+       arm_smmu_free_cd_tables(smmu_domain);
 out_free_asid:
        arm_smmu_bitmap_free(smmu->asid_map, asid);
        return ret;
 }
 
 static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
+                                      struct arm_smmu_master *master,
                                       struct io_pgtable_cfg *pgtbl_cfg)
 {
        int vmid;
        struct arm_smmu_device *smmu = smmu_domain->smmu;
        struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+       typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr;
 
        vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
        if (vmid < 0)
                return vmid;
 
+       vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
        cfg->vmid       = (u16)vmid;
        cfg->vttbr      = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
-       cfg->vtcr       = pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
+       cfg->vtcr       = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) |
+                         FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) |
+                         FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) |
+                         FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) |
+                         FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) |
+                         FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) |
+                         FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps);
        return 0;
 }
 
-static int arm_smmu_domain_finalise(struct iommu_domain *domain)
+static int arm_smmu_domain_finalise(struct iommu_domain *domain,
+                                   struct arm_smmu_master *master)
 {
        int ret;
        unsigned long ias, oas;
@@ -2205,6 +2460,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
        struct io_pgtable_cfg pgtbl_cfg;
        struct io_pgtable_ops *pgtbl_ops;
        int (*finalise_stage_fn)(struct arm_smmu_domain *,
+                                struct arm_smmu_master *,
                                 struct io_pgtable_cfg *);
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
        struct arm_smmu_device *smmu = smmu_domain->smmu;
@@ -2259,7 +2515,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
        domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1;
        domain->geometry.force_aperture = true;
 
-       ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
+       ret = finalise_stage_fn(smmu_domain, master, &pgtbl_cfg);
        if (ret < 0) {
                free_io_pgtable_ops(pgtbl_ops);
                return ret;
@@ -2412,7 +2668,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 
        if (!smmu_domain->smmu) {
                smmu_domain->smmu = smmu;
-               ret = arm_smmu_domain_finalise(domain);
+               ret = arm_smmu_domain_finalise(domain, master);
                if (ret) {
                        smmu_domain->smmu = NULL;
                        goto out_unlock;
@@ -2424,6 +2680,13 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
                        dev_name(smmu->dev));
                ret = -ENXIO;
                goto out_unlock;
+       } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 &&
+                  master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) {
+               dev_err(dev,
+                       "cannot attach to incompatible domain (%u SSID bits != %u)\n",
+                       smmu_domain->s1_cfg.s1cdmax, master->ssid_bits);
+               ret = -EINVAL;
+               goto out_unlock;
        }
 
        master->domain = smmu_domain;
@@ -2431,9 +2694,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
        if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
                master->ats_enabled = arm_smmu_ats_supported(master);
 
-       if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
-               arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg);
-
        arm_smmu_install_ste_for_dev(master);
 
        spin_lock_irqsave(&smmu_domain->devices_lock, flags);
@@ -2534,51 +2794,66 @@ static int arm_smmu_add_device(struct device *dev)
 
        if (!fwspec || fwspec->ops != &arm_smmu_ops)
                return -ENODEV;
-       /*
-        * We _can_ actually withstand dodgy bus code re-calling add_device()
-        * without an intervening remove_device()/of_xlate() sequence, but
-        * we're not going to do so quietly...
-        */
-       if (WARN_ON_ONCE(fwspec->iommu_priv)) {
-               master = fwspec->iommu_priv;
-               smmu = master->smmu;
-       } else {
-               smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
-               if (!smmu)
-                       return -ENODEV;
-               master = kzalloc(sizeof(*master), GFP_KERNEL);
-               if (!master)
-                       return -ENOMEM;
 
-               master->dev = dev;
-               master->smmu = smmu;
-               master->sids = fwspec->ids;
-               master->num_sids = fwspec->num_ids;
-               fwspec->iommu_priv = master;
-       }
+       if (WARN_ON_ONCE(fwspec->iommu_priv))
+               return -EBUSY;
+
+       smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
+       if (!smmu)
+               return -ENODEV;
+
+       master = kzalloc(sizeof(*master), GFP_KERNEL);
+       if (!master)
+               return -ENOMEM;
+
+       master->dev = dev;
+       master->smmu = smmu;
+       master->sids = fwspec->ids;
+       master->num_sids = fwspec->num_ids;
+       fwspec->iommu_priv = master;
 
        /* Check the SIDs are in range of the SMMU and our stream table */
        for (i = 0; i < master->num_sids; i++) {
                u32 sid = master->sids[i];
 
-               if (!arm_smmu_sid_in_range(smmu, sid))
-                       return -ERANGE;
+               if (!arm_smmu_sid_in_range(smmu, sid)) {
+                       ret = -ERANGE;
+                       goto err_free_master;
+               }
 
                /* Ensure l2 strtab is initialised */
                if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
                        ret = arm_smmu_init_l2_strtab(smmu, sid);
                        if (ret)
-                               return ret;
+                               goto err_free_master;
                }
        }
 
+       master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits);
+
+       if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB))
+               master->ssid_bits = min_t(u8, master->ssid_bits,
+                                         CTXDESC_LINEAR_CDMAX);
+
+       ret = iommu_device_link(&smmu->iommu, dev);
+       if (ret)
+               goto err_free_master;
+
        group = iommu_group_get_for_dev(dev);
-       if (!IS_ERR(group)) {
-               iommu_group_put(group);
-               iommu_device_link(&smmu->iommu, dev);
+       if (IS_ERR(group)) {
+               ret = PTR_ERR(group);
+               goto err_unlink;
        }
 
-       return PTR_ERR_OR_ZERO(group);
+       iommu_group_put(group);
+       return 0;
+
+err_unlink:
+       iommu_device_unlink(&smmu->iommu, dev);
+err_free_master:
+       kfree(master);
+       fwspec->iommu_priv = NULL;
+       return ret;
 }
 
 static void arm_smmu_remove_device(struct device *dev)
@@ -2710,15 +2985,6 @@ static void arm_smmu_get_resv_regions(struct device *dev,
        iommu_dma_get_resv_regions(dev, head);
 }
 
-static void arm_smmu_put_resv_regions(struct device *dev,
-                                     struct list_head *head)
-{
-       struct iommu_resv_region *entry, *next;
-
-       list_for_each_entry_safe(entry, next, head, list)
-               kfree(entry);
-}
-
 static struct iommu_ops arm_smmu_ops = {
        .capable                = arm_smmu_capable,
        .domain_alloc           = arm_smmu_domain_alloc,
@@ -2736,7 +3002,7 @@ static struct iommu_ops arm_smmu_ops = {
        .domain_set_attr        = arm_smmu_domain_set_attr,
        .of_xlate               = arm_smmu_of_xlate,
        .get_resv_regions       = arm_smmu_get_resv_regions,
-       .put_resv_regions       = arm_smmu_put_resv_regions,
+       .put_resv_regions       = generic_iommu_put_resv_regions,
        .pgsize_bitmap          = -1UL, /* Restricted during device attach */
 };
 
@@ -2883,7 +3149,7 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
 
        l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
        strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
-                                    GFP_KERNEL | __GFP_ZERO);
+                                    GFP_KERNEL);
        if (!strtab) {
                dev_err(smmu->dev,
                        "failed to allocate l1 stream table (%u bytes)\n",
@@ -2910,7 +3176,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
 
        size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3);
        strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma,
-                                    GFP_KERNEL | __GFP_ZERO);
+                                    GFP_KERNEL);
        if (!strtab) {
                dev_err(smmu->dev,
                        "failed to allocate linear stream table (%u bytes)\n",
@@ -3570,6 +3836,43 @@ static unsigned long arm_smmu_resource_size(struct arm_smmu_device *smmu)
                return SZ_128K;
 }
 
+static int arm_smmu_set_bus_ops(struct iommu_ops *ops)
+{
+       int err;
+
+#ifdef CONFIG_PCI
+       if (pci_bus_type.iommu_ops != ops) {
+               err = bus_set_iommu(&pci_bus_type, ops);
+               if (err)
+                       return err;
+       }
+#endif
+#ifdef CONFIG_ARM_AMBA
+       if (amba_bustype.iommu_ops != ops) {
+               err = bus_set_iommu(&amba_bustype, ops);
+               if (err)
+                       goto err_reset_pci_ops;
+       }
+#endif
+       if (platform_bus_type.iommu_ops != ops) {
+               err = bus_set_iommu(&platform_bus_type, ops);
+               if (err)
+                       goto err_reset_amba_ops;
+       }
+
+       return 0;
+
+err_reset_amba_ops:
+#ifdef CONFIG_ARM_AMBA
+       bus_set_iommu(&amba_bustype, NULL);
+#endif
+err_reset_pci_ops: __maybe_unused;
+#ifdef CONFIG_PCI
+       bus_set_iommu(&pci_bus_type, NULL);
+#endif
+       return err;
+}
+
 static int arm_smmu_device_probe(struct platform_device *pdev)
 {
        int irq, ret;
@@ -3599,7 +3902,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
 
        /* Base address */
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (resource_size(res) + 1 < arm_smmu_resource_size(smmu)) {
+       if (resource_size(res) < arm_smmu_resource_size(smmu)) {
                dev_err(dev, "MMIO region too small (%pr)\n", res);
                return -EINVAL;
        }
@@ -3660,48 +3963,45 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
                return ret;
        }
 
-#ifdef CONFIG_PCI
-       if (pci_bus_type.iommu_ops != &arm_smmu_ops) {
-               pci_request_acs();
-               ret = bus_set_iommu(&pci_bus_type, &arm_smmu_ops);
-               if (ret)
-                       return ret;
-       }
-#endif
-#ifdef CONFIG_ARM_AMBA
-       if (amba_bustype.iommu_ops != &arm_smmu_ops) {
-               ret = bus_set_iommu(&amba_bustype, &arm_smmu_ops);
-               if (ret)
-                       return ret;
-       }
-#endif
-       if (platform_bus_type.iommu_ops != &arm_smmu_ops) {
-               ret = bus_set_iommu(&platform_bus_type, &arm_smmu_ops);
-               if (ret)
-                       return ret;
-       }
-       return 0;
+       return arm_smmu_set_bus_ops(&arm_smmu_ops);
 }
 
-static void arm_smmu_device_shutdown(struct platform_device *pdev)
+static int arm_smmu_device_remove(struct platform_device *pdev)
 {
        struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
 
+       arm_smmu_set_bus_ops(NULL);
+       iommu_device_unregister(&smmu->iommu);
+       iommu_device_sysfs_remove(&smmu->iommu);
        arm_smmu_device_disable(smmu);
+
+       return 0;
+}
+
+static void arm_smmu_device_shutdown(struct platform_device *pdev)
+{
+       arm_smmu_device_remove(pdev);
 }
 
 static const struct of_device_id arm_smmu_of_match[] = {
        { .compatible = "arm,smmu-v3", },
        { },
 };
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
 
 static struct platform_driver arm_smmu_driver = {
        .driver = {
-               .name           = "arm-smmu-v3",
-               .of_match_table = of_match_ptr(arm_smmu_of_match),
-               .suppress_bind_attrs = true,
+               .name                   = "arm-smmu-v3",
+               .of_match_table         = arm_smmu_of_match,
+               .suppress_bind_attrs    = true,
        },
        .probe  = arm_smmu_device_probe,
+       .remove = arm_smmu_device_remove,
        .shutdown = arm_smmu_device_shutdown,
 };
-builtin_platform_driver(arm_smmu_driver);
+module_platform_driver(arm_smmu_driver);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMUv3 implementations");
+MODULE_AUTHOR("Will Deacon <will@kernel.org>");
+MODULE_ALIAS("platform:arm-smmu-v3");
+MODULE_LICENSE("GPL v2");