Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jul 2017 00:09:27 +0000 (17:09 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jul 2017 00:09:27 +0000 (17:09 -0700)
Pull arm64 updates from Will Deacon:

 - RAS reporting via GHES/APEI (ACPI)

 - Indirect ftrace trampolines for modules

 - Improvements to kernel fault reporting

 - Page poisoning

 - Sigframe cleanups and preparation for SVE context

 - Core dump fixes

 - Sparse fixes (mainly relating to endianness)

 - xgene SoC PMU v3 driver

 - Misc cleanups and non-critical fixes

* tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (75 commits)
  arm64: fix endianness annotation for 'struct jit_ctx' and friends
  arm64: cpuinfo: constify attribute_group structures.
  arm64: ptrace: Fix incorrect get_user() use in compat_vfp_set()
  arm64: ptrace: Remove redundant overrun check from compat_vfp_set()
  arm64: ptrace: Avoid setting compat FP[SC]R to garbage if get_user fails
  arm64: fix endianness annotation for __apply_alternatives()/get_alt_insn()
  arm64: fix endianness annotation in get_kaslr_seed()
  arm64: add missing conversion to __wsum in ip_fast_csum()
  arm64: fix endianness annotation in acpi_parking_protocol.c
  arm64: use readq() instead of readl() to read 64bit entry_point
  arm64: fix endianness annotation for reloc_insn_movw() & reloc_insn_imm()
  arm64: fix endianness annotation for aarch64_insn_write()
  arm64: fix endianness annotation in aarch64_insn_read()
  arm64: fix endianness annotation in call_undef_hook()
  arm64: fix endianness annotation for debug-monitors.c
  ras: mark stub functions as 'inline'
  arm64: pass endianness info to sparse
  arm64: ftrace: fix !CONFIG_ARM64_MODULE_PLTS kernels
  arm64: signal: Allow expansion of the signal frame
  acpi: apei: check for pending errors when probing GHES entries
  ...

62 files changed:
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/system_misc.h
arch/arm/kernel/perf_event_v6.c
arch/arm64/Kconfig
arch/arm64/Makefile
arch/arm64/include/asm/checksum.h
arch/arm64/include/asm/dma-mapping.h
arch/arm64/include/asm/elf.h
arch/arm64/include/asm/esr.h
arch/arm64/include/asm/futex.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/module.h
arch/arm64/include/asm/pgtable.h
arch/arm64/include/asm/processor.h
arch/arm64/include/asm/stacktrace.h
arch/arm64/include/asm/system_misc.h
arch/arm64/include/uapi/asm/sigcontext.h
arch/arm64/kernel/Makefile
arch/arm64/kernel/acpi_parking_protocol.c
arch/arm64/kernel/alternative.c
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/cpuinfo.c
arch/arm64/kernel/debug-monitors.c
arch/arm64/kernel/ftrace-mod.S [new file with mode: 0644]
arch/arm64/kernel/ftrace.c
arch/arm64/kernel/insn.c
arch/arm64/kernel/kaslr.c
arch/arm64/kernel/module.c
arch/arm64/kernel/pci.c
arch/arm64/kernel/perf_event.c
arch/arm64/kernel/probes/kprobes.c
arch/arm64/kernel/process.c
arch/arm64/kernel/ptrace.c
arch/arm64/kernel/setup.c
arch/arm64/kernel/signal.c
arch/arm64/kernel/stacktrace.c
arch/arm64/kernel/traps.c
arch/arm64/kernel/vdso.c
arch/arm64/mm/dma-mapping.c
arch/arm64/mm/fault.c
arch/arm64/mm/hugetlbpage.c
arch/arm64/mm/mmap.c
arch/arm64/mm/mmu.c
arch/arm64/net/bpf_jit_comp.c
drivers/acpi/apei/Kconfig
drivers/acpi/apei/ghes.c
drivers/acpi/apei/hest.c
drivers/acpi/arm64/iort.c
drivers/char/Kconfig
drivers/firmware/efi/cper.c
drivers/irqchip/Kconfig
drivers/perf/Kconfig
drivers/perf/xgene_pmu.c
drivers/ras/ras.c
fs/proc/kcore.c
include/acpi/ghes.h
include/linux/acpi_iort.h
include/linux/cper.h
include/linux/ras.h
include/linux/uuid.h
include/ras/ras_event.h
virt/kvm/arm/mmu.c

index a3f0b3d..ebf020b 100644 (file)
 #define FSC_FAULT      (0x04)
 #define FSC_ACCESS     (0x08)
 #define FSC_PERM       (0x0c)
+#define FSC_SEA                (0x10)
+#define FSC_SEA_TTW0   (0x14)
+#define FSC_SEA_TTW1   (0x15)
+#define FSC_SEA_TTW2   (0x16)
+#define FSC_SEA_TTW3   (0x17)
+#define FSC_SECC       (0x18)
+#define FSC_SECC_TTW0  (0x1c)
+#define FSC_SECC_TTW1  (0x1d)
+#define FSC_SECC_TTW2  (0x1e)
+#define FSC_SECC_TTW3  (0x1f)
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK     (~0xf)
index a3d61ad..8c4a89f 100644 (file)
@@ -22,6 +22,11 @@ extern void (*arm_pm_idle)(void);
 
 extern unsigned int user_debug;
 
+static inline int handle_guest_sea(phys_addr_t addr, unsigned int esr)
+{
+       return -1;
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_ARM_SYSTEM_MISC_H */
index 96b7a47..8226d0b 100644 (file)
@@ -552,7 +552,7 @@ static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
        return 0;
 }
 
-static struct of_device_id armv6_pmu_of_device_ids[] = {
+static const struct of_device_id armv6_pmu_of_device_ids[] = {
        {.compatible = "arm,arm11mpcore-pmu",   .data = armv6mpcore_pmu_init},
        {.compatible = "arm,arm1176-pmu",       .data = armv6_1176_pmu_init},
        {.compatible = "arm,arm1136-pmu",       .data = armv6_1136_pmu_init},
index 300146d..9f7a934 100644 (file)
@@ -3,6 +3,7 @@ config ARM64
        select ACPI_CCA_REQUIRED if ACPI
        select ACPI_GENERIC_GSI if ACPI
        select ACPI_GTDT if ACPI
+       select ACPI_IORT if ACPI
        select ACPI_REDUCED_HARDWARE_ONLY if ACPI
        select ACPI_MCFG if ACPI
        select ACPI_SPCR_TABLE if ACPI
@@ -19,7 +20,9 @@ config ARM64
        select ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_HAS_STRICT_MODULE_RWX
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG if ACPI_APEI_SEA
        select ARCH_USE_CMPXCHG_LOCKREF
+       select ARCH_SUPPORTS_MEMORY_FAILURE
        select ARCH_SUPPORTS_ATOMIC_RMW
        select ARCH_SUPPORTS_NUMA_BALANCING
        select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
@@ -93,6 +96,7 @@ config ARM64
        select HAVE_IRQ_TIME_ACCOUNTING
        select HAVE_MEMBLOCK
        select HAVE_MEMBLOCK_NODE_MAP if NUMA
+       select HAVE_NMI if ACPI_APEI_SEA
        select HAVE_PATA_PLATFORM
        select HAVE_PERF_EVENTS
        select HAVE_PERF_REGS
@@ -245,6 +249,9 @@ config PGTABLE_LEVELS
 config ARCH_SUPPORTS_UPROBES
        def_bool y
 
+config ARCH_PROC_KCORE_TEXT
+       def_bool y
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
@@ -983,7 +990,7 @@ config RANDOMIZE_BASE
 
 config RANDOMIZE_MODULE_REGION_FULL
        bool "Randomize the module region independently from the core kernel"
-       depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE
+       depends on RANDOMIZE_BASE
        default y
        help
          Randomizes the location of the module region without considering the
index f839ecd..9b41f1e 100644 (file)
@@ -52,17 +52,19 @@ KBUILD_AFLAGS       += $(lseinstr) $(brokengasinst)
 
 ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
 KBUILD_CPPFLAGS        += -mbig-endian
+CHECKFLAGS     += -D__AARCH64EB__
 AS             += -EB
 LD             += -EB
 UTS_MACHINE    := aarch64_be
 else
 KBUILD_CPPFLAGS        += -mlittle-endian
+CHECKFLAGS     += -D__AARCH64EL__
 AS             += -EL
 LD             += -EL
 UTS_MACHINE    := aarch64
 endif
 
-CHECKFLAGS     += -D__aarch64__
+CHECKFLAGS     += -D__aarch64__ -m64
 
 ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y)
 KBUILD_CFLAGS_MODULE   += -mcmodel=large
@@ -70,6 +72,9 @@ endif
 
 ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
 KBUILD_LDFLAGS_MODULE  += -T $(srctree)/arch/arm64/kernel/module.lds
+ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
+KBUILD_LDFLAGS_MODULE  += $(objtree)/arch/arm64/kernel/ftrace-mod.o
+endif
 endif
 
 # Default value
index 09f6533..0b6f5a7 100644 (file)
@@ -42,7 +42,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
        } while (--ihl);
 
        sum += ((sum >> 32) | (sum << 32));
-       return csum_fold(sum >> 32);
+       return csum_fold((__force u32)(sum >> 32));
 }
 #define ip_fast_csum ip_fast_csum
 
index 5392dbe..f72779a 100644 (file)
@@ -48,8 +48,6 @@ void arch_teardown_dma_ops(struct device *dev);
 /* do not use this function in a driver */
 static inline bool is_device_dma_coherent(struct device *dev)
 {
-       if (!dev)
-               return false;
        return dev->archdata.dma_coherent;
 }
 
index 5d17004..ac3fb74 100644 (file)
@@ -142,6 +142,7 @@ typedef struct user_fpsimd_state elf_fpregset_t;
 ({                                                                     \
        clear_bit(TIF_32BIT, &current->mm->context.flags);              \
        clear_thread_flag(TIF_32BIT);                                   \
+       current->personality &= ~READ_IMPLIES_EXEC;                     \
 })
 
 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
@@ -187,6 +188,11 @@ typedef compat_elf_greg_t          compat_elf_gregset_t[COMPAT_ELF_NGREG];
                                         ((x)->e_flags & EF_ARM_EABI_MASK))
 
 #define compat_start_thread            compat_start_thread
+/*
+ * Unlike the native SET_PERSONALITY macro, the compat version inherits
+ * READ_IMPLIES_EXEC across a fork() since this is the behaviour on
+ * arch/arm/.
+ */
 #define COMPAT_SET_PERSONALITY(ex)                                     \
 ({                                                                     \
        set_bit(TIF_32BIT, &current->mm->context.flags);                \
index 85997c0..28bf02e 100644 (file)
@@ -83,6 +83,7 @@
 #define ESR_ELx_WNR            (UL(1) << 6)
 
 /* Shared ISS field definitions for Data/Instruction aborts */
+#define ESR_ELx_FnV            (UL(1) << 10)
 #define ESR_ELx_EA             (UL(1) << 9)
 #define ESR_ELx_S1PTW          (UL(1) << 7)
 
index 85c4a89..f32b42e 100644 (file)
@@ -48,16 +48,16 @@ do {                                                                        \
 } while (0)
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
 {
        int op = (encoded_op >> 28) & 7;
        int cmp = (encoded_op >> 24) & 15;
-       int oparg = (encoded_op << 8) >> 20;
-       int cmparg = (encoded_op << 20) >> 20;
+       int oparg = (int)(encoded_op << 8) >> 20;
+       int cmparg = (int)(encoded_op << 20) >> 20;
        int oldval = 0, ret, tmp;
 
        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-               oparg = 1 << oparg;
+               oparg = 1U << (oparg & 0x1f);
 
        if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
                return -EFAULT;
index 6e99978..61d694c 100644 (file)
 #define FSC_FAULT      ESR_ELx_FSC_FAULT
 #define FSC_ACCESS     ESR_ELx_FSC_ACCESS
 #define FSC_PERM       ESR_ELx_FSC_PERM
+#define FSC_SEA                ESR_ELx_FSC_EXTABT
+#define FSC_SEA_TTW0   (0x14)
+#define FSC_SEA_TTW1   (0x15)
+#define FSC_SEA_TTW2   (0x16)
+#define FSC_SEA_TTW3   (0x17)
+#define FSC_SECC       (0x18)
+#define FSC_SECC_TTW0  (0x1c)
+#define FSC_SECC_TTW1  (0x1d)
+#define FSC_SECC_TTW2  (0x1e)
+#define FSC_SECC_TTW3  (0x1f)
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK     (~UL(0xf))
index d57693f..19bd976 100644 (file)
@@ -30,6 +30,9 @@ struct mod_plt_sec {
 struct mod_arch_specific {
        struct mod_plt_sec      core;
        struct mod_plt_sec      init;
+
+       /* for CONFIG_DYNAMIC_FTRACE */
+       void                    *ftrace_trampoline;
 };
 #endif
 
index c213fdb..6eae342 100644 (file)
@@ -441,7 +441,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 
 #define pud_none(pud)          (!pud_val(pud))
 #define pud_bad(pud)           (!(pud_val(pud) & PUD_TABLE_BIT))
-#define pud_present(pud)       (pud_val(pud))
+#define pud_present(pud)       pte_present(pud_pte(pud))
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
index 9428b93..64c9e78 100644 (file)
@@ -104,6 +104,9 @@ struct thread_struct {
 #define task_user_tls(t)       (&(t)->thread.tp_value)
 #endif
 
+/* Sync TPIDR_EL0 back to thread_struct for current */
+void tls_preserve_current_state(void);
+
 #define INIT_THREAD  { }
 
 static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
index 801a16d..5b6eafc 100644 (file)
@@ -30,5 +30,6 @@ struct stackframe {
 extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
 extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
                            int (*fn)(struct stackframe *, void *), void *data);
+extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk);
 
 #endif /* __ASM_STACKTRACE_H */
index bc81243..07aa8e3 100644 (file)
@@ -40,7 +40,7 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int,
                           int sig, int code, const char *name);
 
 struct mm_struct;
-extern void show_pte(struct mm_struct *mm, unsigned long addr);
+extern void show_pte(unsigned long addr);
 extern void __show_regs(struct pt_regs *);
 
 extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
@@ -56,6 +56,8 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
        __show_ratelimited;                                             \
 })
 
+int handle_guest_sea(phys_addr_t addr, unsigned int esr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __ASM_SYSTEM_MISC_H */
index ee469be..f0a76b9 100644 (file)
@@ -33,6 +33,26 @@ struct sigcontext {
        __u8 __reserved[4096] __attribute__((__aligned__(16)));
 };
 
+/*
+ * Allocation of __reserved[]:
+ * (Note: records do not necessarily occur in the order shown here.)
+ *
+ *     size            description
+ *
+ *     0x210           fpsimd_context
+ *      0x10           esr_context
+ *      0x20           extra_context (optional)
+ *      0x10           terminator (null _aarch64_ctx)
+ *
+ *     0xdb0           (reserved for future allocation)
+ *
+ * New records that can exceed this space need to be opt-in for userspace, so
+ * that an expanded signal frame is not generated unexpectedly.  The mechanism
+ * for opting in will depend on the extension that generates each new record.
+ * The above table documents the maximum set and sizes of records that can be
+ * generated when userspace does not opt in for any such extension.
+ */
+
 /*
  * Header to be used at the beginning of structures extending the user
  * context. Such structures must be placed after the rt_sigframe on the stack
@@ -61,4 +81,39 @@ struct esr_context {
        __u64 esr;
 };
 
+/*
+ * extra_context: describes extra space in the signal frame for
+ * additional structures that don't fit in sigcontext.__reserved[].
+ *
+ * Note:
+ *
+ * 1) fpsimd_context, esr_context and extra_context must be placed in
+ * sigcontext.__reserved[] if present.  They cannot be placed in the
+ * extra space.  Any other record can be placed either in the extra
+ * space or in sigcontext.__reserved[], unless otherwise specified in
+ * this file.
+ *
+ * 2) There must not be more than one extra_context.
+ *
+ * 3) If extra_context is present, it must be followed immediately in
+ * sigcontext.__reserved[] by the terminating null _aarch64_ctx.
+ *
+ * 4) The extra space to which datap points must start at the first
+ * 16-byte aligned address immediately after the terminating null
+ * _aarch64_ctx that follows the extra_context structure in
+ * __reserved[].  The extra space may overrun the end of __reserved[],
+ * as indicated by a sufficiently large value for the size field.
+ *
+ * 5) The extra space must itself be terminated with a null
+ * _aarch64_ctx.
+ */
+#define EXTRA_MAGIC    0x45585401
+
+struct extra_context {
+       struct _aarch64_ctx head;
+       __u64 datap; /* 16-byte aligned pointer to extra space cast to __u64 */
+       __u32 size; /* size in bytes of the extra space */
+       __u32 __reserved[3];
+};
+
 #endif /* _UAPI__ASM_SIGCONTEXT_H */
index 1dcb69d..f2b4e81 100644 (file)
@@ -62,3 +62,6 @@ extra-y                                       += $(head-y) vmlinux.lds
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
 endif
+
+# will be included by each individual module but not by the core kernel itself
+extra-$(CONFIG_DYNAMIC_FTRACE) += ftrace-mod.o
index 1f5655c..98a20e5 100644 (file)
@@ -71,7 +71,7 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu)
 {
        struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
        struct parking_protocol_mailbox __iomem *mailbox;
-       __le32 cpu_id;
+       u32 cpu_id;
 
        /*
         * Map mailbox memory with attribute device nGnRE (ie ioremap -
@@ -123,9 +123,9 @@ static void acpi_parking_protocol_cpu_postboot(void)
        int cpu = smp_processor_id();
        struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
        struct parking_protocol_mailbox __iomem *mailbox = cpu_entry->mailbox;
-       __le64 entry_point;
+       u64 entry_point;
 
-       entry_point = readl_relaxed(&mailbox->entry_point);
+       entry_point = readq_relaxed(&mailbox->entry_point);
        /*
         * Check if firmware has cleared the entry_point as expected
         * by the protocol specification.
index 8840c10..6dd0a3a 100644 (file)
@@ -28,7 +28,7 @@
 #include <asm/sections.h>
 #include <linux/stop_machine.h>
 
-#define __ALT_PTR(a,f)         (u32 *)((void *)&(a)->f + (a)->f)
+#define __ALT_PTR(a,f)         ((void *)&(a)->f + (a)->f)
 #define ALT_ORIG_PTR(a)                __ALT_PTR(a, orig_offset)
 #define ALT_REPL_PTR(a)                __ALT_PTR(a, alt_offset)
 
@@ -60,7 +60,7 @@ static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc)
 
 #define align_down(x, a)       ((unsigned long)(x) & ~(((unsigned long)(a)) - 1))
 
-static u32 get_alt_insn(struct alt_instr *alt, u32 *insnptr, u32 *altinsnptr)
+static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr)
 {
        u32 insn;
 
@@ -109,7 +109,7 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias)
 {
        struct alt_instr *alt;
        struct alt_region *region = alt_region;
-       u32 *origptr, *replptr, *updptr;
+       __le32 *origptr, *replptr, *updptr;
 
        for (alt = region->begin; alt < region->end; alt++) {
                u32 insn;
@@ -124,7 +124,7 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias)
 
                origptr = ALT_ORIG_PTR(alt);
                replptr = ALT_REPL_PTR(alt);
-               updptr = use_linear_alias ? (u32 *)lm_alias(origptr) : origptr;
+               updptr = use_linear_alias ? lm_alias(origptr) : origptr;
                nr_inst = alt->alt_len / sizeof(insn);
 
                for (i = 0; i < nr_inst; i++) {
index 817ce33..9f9e006 100644 (file)
@@ -51,6 +51,25 @@ unsigned int compat_elf_hwcap2 __read_mostly;
 DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
 EXPORT_SYMBOL(cpu_hwcaps);
 
+static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p)
+{
+       /* file-wide pr_fmt adds "CPU features: " prefix */
+       pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps);
+       return 0;
+}
+
+static struct notifier_block cpu_hwcaps_notifier = {
+       .notifier_call = dump_cpu_hwcaps
+};
+
+static int __init register_cpu_hwcaps_dumper(void)
+{
+       atomic_notifier_chain_register(&panic_notifier_list,
+                                      &cpu_hwcaps_notifier);
+       return 0;
+}
+__initcall(register_cpu_hwcaps_dumper);
+
 DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS);
 EXPORT_SYMBOL(cpu_hwcap_keys);
 
@@ -639,8 +658,10 @@ void update_cpu_features(int cpu,
         * Mismatched CPU features are a recipe for disaster. Don't even
         * pretend to support them.
         */
-       WARN_TAINT_ONCE(taint, TAINT_CPU_OUT_OF_SPEC,
-                       "Unsupported CPU feature variation.\n");
+       if (taint) {
+               pr_warn_once("Unsupported CPU feature variation detected.\n");
+               add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+       }
 }
 
 u64 read_sanitised_ftr_reg(u32 id)
index 68b1f36..f495ee5 100644 (file)
@@ -227,7 +227,7 @@ static struct attribute *cpuregs_id_attrs[] = {
        NULL
 };
 
-static struct attribute_group cpuregs_attr_group = {
+static const struct attribute_group cpuregs_attr_group = {
        .attrs = cpuregs_id_attrs,
        .name = "identification"
 };
index d618e25..c7ef999 100644 (file)
@@ -341,20 +341,22 @@ int aarch32_break_handler(struct pt_regs *regs)
 
        if (compat_thumb_mode(regs)) {
                /* get 16-bit Thumb instruction */
-               get_user(thumb_instr, (u16 __user *)pc);
-               thumb_instr = le16_to_cpu(thumb_instr);
+               __le16 instr;
+               get_user(instr, (__le16 __user *)pc);
+               thumb_instr = le16_to_cpu(instr);
                if (thumb_instr == AARCH32_BREAK_THUMB2_LO) {
                        /* get second half of 32-bit Thumb-2 instruction */
-                       get_user(thumb_instr, (u16 __user *)(pc + 2));
-                       thumb_instr = le16_to_cpu(thumb_instr);
+                       get_user(instr, (__le16 __user *)(pc + 2));
+                       thumb_instr = le16_to_cpu(instr);
                        bp = thumb_instr == AARCH32_BREAK_THUMB2_HI;
                } else {
                        bp = thumb_instr == AARCH32_BREAK_THUMB;
                }
        } else {
                /* 32-bit ARM instruction */
-               get_user(arm_instr, (u32 __user *)pc);
-               arm_instr = le32_to_cpu(arm_instr);
+               __le32 instr;
+               get_user(instr, (__le32 __user *)pc);
+               arm_instr = le32_to_cpu(instr);
                bp = (arm_instr & ~0xf0000000) == AARCH32_BREAK_ARM;
        }
 
diff --git a/arch/arm64/kernel/ftrace-mod.S b/arch/arm64/kernel/ftrace-mod.S
new file mode 100644 (file)
index 0000000..00c4025
--- /dev/null
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .section        ".text.ftrace_trampoline", "ax"
+       .align          3
+0:     .quad           0
+__ftrace_trampoline:
+       ldr             x16, 0b
+       br              x16
+ENDPROC(__ftrace_trampoline)
index 40ad08a..c13b1fc 100644 (file)
  */
 
 #include <linux/ftrace.h>
+#include <linux/module.h>
 #include <linux/swab.h>
 #include <linux/uaccess.h>
 
 #include <asm/cacheflush.h>
+#include <asm/debug-monitors.h>
 #include <asm/ftrace.h>
 #include <asm/insn.h>
 
@@ -70,6 +72,58 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
        unsigned long pc = rec->ip;
        u32 old, new;
+       long offset = (long)pc - (long)addr;
+
+       if (offset < -SZ_128M || offset >= SZ_128M) {
+#ifdef CONFIG_ARM64_MODULE_PLTS
+               unsigned long *trampoline;
+               struct module *mod;
+
+               /*
+                * On kernels that support module PLTs, the offset between the
+                * branch instruction and its target may legally exceed the
+                * range of an ordinary relative 'bl' opcode. In this case, we
+                * need to branch via a trampoline in the module.
+                *
+                * NOTE: __module_text_address() must be called with preemption
+                * disabled, but we can rely on ftrace_lock to ensure that 'mod'
+                * retains its validity throughout the remainder of this code.
+                */
+               preempt_disable();
+               mod = __module_text_address(pc);
+               preempt_enable();
+
+               if (WARN_ON(!mod))
+                       return -EINVAL;
+
+               /*
+                * There is only one ftrace trampoline per module. For now,
+                * this is not a problem since on arm64, all dynamic ftrace
+                * invocations are routed via ftrace_caller(). This will need
+                * to be revisited if support for multiple ftrace entry points
+                * is added in the future, but for now, the pr_err() below
+                * deals with a theoretical issue only.
+                */
+               trampoline = (unsigned long *)mod->arch.ftrace_trampoline;
+               if (trampoline[0] != addr) {
+                       if (trampoline[0] != 0) {
+                               pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n");
+                               return -EINVAL;
+                       }
+
+                       /* point the trampoline to our ftrace entry point */
+                       module_disable_ro(mod);
+                       trampoline[0] = addr;
+                       module_enable_ro(mod, true);
+
+                       /* update trampoline before patching in the branch */
+                       smp_wmb();
+               }
+               addr = (unsigned long)&trampoline[1];
+#else /* CONFIG_ARM64_MODULE_PLTS */
+               return -EINVAL;
+#endif /* CONFIG_ARM64_MODULE_PLTS */
+       }
 
        old = aarch64_insn_gen_nop();
        new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
@@ -84,12 +138,55 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
                    unsigned long addr)
 {
        unsigned long pc = rec->ip;
-       u32 old, new;
+       bool validate = true;
+       u32 old = 0, new;
+       long offset = (long)pc - (long)addr;
+
+       if (offset < -SZ_128M || offset >= SZ_128M) {
+#ifdef CONFIG_ARM64_MODULE_PLTS
+               u32 replaced;
+
+               /*
+                * 'mod' is only set at module load time, but if we end up
+                * dealing with an out-of-range condition, we can assume it
+                * is due to a module being loaded far away from the kernel.
+                */
+               if (!mod) {
+                       preempt_disable();
+                       mod = __module_text_address(pc);
+                       preempt_enable();
+
+                       if (WARN_ON(!mod))
+                               return -EINVAL;
+               }
+
+               /*
+                * The instruction we are about to patch may be a branch and
+                * link instruction that was redirected via a PLT entry. In
+                * this case, the normal validation will fail, but we can at
+                * least check that we are dealing with a branch and link
+                * instruction that points into the right module.
+                */
+               if (aarch64_insn_read((void *)pc, &replaced))
+                       return -EFAULT;
+
+               if (!aarch64_insn_is_bl(replaced) ||
+                   !within_module(pc + aarch64_get_branch_offset(replaced),
+                                  mod))
+                       return -EINVAL;
+
+               validate = false;
+#else /* CONFIG_ARM64_MODULE_PLTS */
+               return -EINVAL;
+#endif /* CONFIG_ARM64_MODULE_PLTS */
+       } else {
+               old = aarch64_insn_gen_branch_imm(pc, addr,
+                                                 AARCH64_INSN_BRANCH_LINK);
+       }
 
-       old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
        new = aarch64_insn_gen_nop();
 
-       return ftrace_modify_code(pc, old, new, true);
+       return ftrace_modify_code(pc, old, new, validate);
 }
 
 void arch_ftrace_update_code(int command)
index cd87213..2718a77 100644 (file)
@@ -117,7 +117,7 @@ static void __kprobes patch_unmap(int fixmap)
 int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
 {
        int ret;
-       u32 val;
+       __le32 val;
 
        ret = probe_kernel_read(&val, addr, AARCH64_INSN_SIZE);
        if (!ret)
@@ -126,7 +126,7 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
        return ret;
 }
 
-static int __kprobes __aarch64_insn_write(void *addr, u32 insn)
+static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
 {
        void *waddr = addr;
        unsigned long flags = 0;
@@ -145,8 +145,7 @@ static int __kprobes __aarch64_insn_write(void *addr, u32 insn)
 
 int __kprobes aarch64_insn_write(void *addr, u32 insn)
 {
-       insn = cpu_to_le32(insn);
-       return __aarch64_insn_write(addr, insn);
+       return __aarch64_insn_write(addr, cpu_to_le32(insn));
 }
 
 static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn)
index d7e90d9..a9710ef 100644 (file)
@@ -27,7 +27,7 @@ u16 __initdata memstart_offset_seed;
 static __init u64 get_kaslr_seed(void *fdt)
 {
        int node, len;
-       u64 *prop;
+       fdt64_t *prop;
        u64 ret;
 
        node = fdt_path_offset(fdt, "/chosen");
index f035ff6..f469e04 100644 (file)
@@ -74,7 +74,7 @@ enum aarch64_reloc_op {
        RELOC_OP_PAGE,
 };
 
-static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val)
+static u64 do_reloc(enum aarch64_reloc_op reloc_op, __le32 *place, u64 val)
 {
        switch (reloc_op) {
        case RELOC_OP_ABS:
@@ -121,12 +121,12 @@ enum aarch64_insn_movw_imm_type {
        AARCH64_INSN_IMM_MOVKZ,
 };
 
-static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
+static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val,
                           int lsb, enum aarch64_insn_movw_imm_type imm_type)
 {
        u64 imm;
        s64 sval;
-       u32 insn = le32_to_cpu(*(u32 *)place);
+       u32 insn = le32_to_cpu(*place);
 
        sval = do_reloc(op, place, val);
        imm = sval >> lsb;
@@ -154,7 +154,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
 
        /* Update the instruction with the new encoding. */
        insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
-       *(u32 *)place = cpu_to_le32(insn);
+       *place = cpu_to_le32(insn);
 
        if (imm > U16_MAX)
                return -ERANGE;
@@ -162,12 +162,12 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
        return 0;
 }
 
-static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val,
+static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val,
                          int lsb, int len, enum aarch64_insn_imm_type imm_type)
 {
        u64 imm, imm_mask;
        s64 sval;
-       u32 insn = le32_to_cpu(*(u32 *)place);
+       u32 insn = le32_to_cpu(*place);
 
        /* Calculate the relocation value. */
        sval = do_reloc(op, place, val);
@@ -179,7 +179,7 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val,
 
        /* Update the instruction's immediate field. */
        insn = aarch64_insn_encode_immediate(imm_type, insn, imm);
-       *(u32 *)place = cpu_to_le32(insn);
+       *place = cpu_to_le32(insn);
 
        /*
         * Extract the upper value bits (including the sign bit) and
@@ -420,8 +420,12 @@ int module_finalize(const Elf_Ehdr *hdr,
        for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
                if (strcmp(".altinstructions", secstrs + s->sh_name) == 0) {
                        apply_alternatives((void *)s->sh_addr, s->sh_size);
-                       return 0;
                }
+#ifdef CONFIG_ARM64_MODULE_PLTS
+               if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) &&
+                   !strcmp(".text.ftrace_trampoline", secstrs + s->sh_name))
+                       me->arch.ftrace_trampoline = (void *)s->sh_addr;
+#endif
        }
 
        return 0;
index c7e3e63..a7f6c01 100644 (file)
@@ -108,7 +108,10 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
        if (!acpi_disabled) {
                struct pci_config_window *cfg = bridge->bus->sysdata;
                struct acpi_device *adev = to_acpi_device(cfg->parent);
+               struct device *bus_dev = &bridge->bus->dev;
+
                ACPI_COMPANION_SET(&bridge->dev, adev);
+               set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev)));
        }
 
        return 0;
index 83a1b1a..b5798ba 100644 (file)
@@ -529,7 +529,7 @@ static struct attribute_group armv8_pmuv3_events_attr_group = {
        .is_visible = armv8pmu_event_attr_is_visible,
 };
 
-PMU_FORMAT_ATTR(event, "config:0-9");
+PMU_FORMAT_ATTR(event, "config:0-15");
 
 static struct attribute *armv8_pmuv3_format_attrs[] = {
        &format_attr_event.attr,
index c5c4594..d849d98 100644 (file)
@@ -522,9 +522,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
                pr_err("current sp %lx does not match saved sp %lx\n",
                       orig_sp, stack_addr);
                pr_err("Saved registers for jprobe %p\n", jp);
-               show_regs(saved_regs);
+               __show_regs(saved_regs);
                pr_err("Current registers\n");
-               show_regs(regs);
+               __show_regs(regs);
                BUG();
        }
        unpause_graph_tracing();
index ae2a835..659ae80 100644 (file)
@@ -210,6 +210,7 @@ void __show_regs(struct pt_regs *regs)
 void show_regs(struct pt_regs * regs)
 {
        __show_regs(regs);
+       dump_backtrace(regs, NULL);
 }
 
 static void tls_thread_flush(void)
@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
        return 0;
 }
 
+void tls_preserve_current_state(void)
+{
+       *task_user_tls(current) = read_sysreg(tpidr_el0);
+}
+
 static void tls_thread_switch(struct task_struct *next)
 {
        unsigned long tpidr, tpidrro;
 
-       tpidr = read_sysreg(tpidr_el0);
-       *task_user_tls(current) = tpidr;
+       tls_preserve_current_state();
 
        tpidr = *task_user_tls(next);
        tpidrro = is_compat_thread(task_thread_info(next)) ?
index c142459..1b38c01 100644 (file)
@@ -623,6 +623,10 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
 {
        struct user_fpsimd_state *uregs;
        uregs = &target->thread.fpsimd_state.user_fpsimd;
+
+       if (target == current)
+               fpsimd_preserve_current_state();
+
        return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1);
 }
 
@@ -648,6 +652,10 @@ static int tls_get(struct task_struct *target, const struct user_regset *regset,
                   void *kbuf, void __user *ubuf)
 {
        unsigned long *tls = &target->thread.tp_value;
+
+       if (target == current)
+               tls_preserve_current_state();
+
        return user_regset_copyout(&pos, &count, &kbuf, &ubuf, tls, 0, -1);
 }
 
@@ -894,21 +902,27 @@ static int compat_vfp_get(struct task_struct *target,
 {
        struct user_fpsimd_state *uregs;
        compat_ulong_t fpscr;
-       int ret;
+       int ret, vregs_end_pos;
 
        uregs = &target->thread.fpsimd_state.user_fpsimd;
 
+       if (target == current)
+               fpsimd_preserve_current_state();
+
        /*
         * The VFP registers are packed into the fpsimd_state, so they all sit
         * nicely together for us. We just need to create the fpscr separately.
         */
-       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0,
-                                 VFP_STATE_SIZE - sizeof(compat_ulong_t));
+       vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t);
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs,
+                                 0, vregs_end_pos);
 
        if (count && !ret) {
                fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) |
                        (uregs->fpcr & VFP_FPSCR_CTRL_MASK);
-               ret = put_user(fpscr, (compat_ulong_t *)ubuf);
+
+               ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpscr,
+                                         vregs_end_pos, VFP_STATE_SIZE);
        }
 
        return ret;
@@ -921,20 +935,21 @@ static int compat_vfp_set(struct task_struct *target,
 {
        struct user_fpsimd_state *uregs;
        compat_ulong_t fpscr;
-       int ret;
-
-       if (pos + count > VFP_STATE_SIZE)
-               return -EIO;
+       int ret, vregs_end_pos;
 
        uregs = &target->thread.fpsimd_state.user_fpsimd;
 
+       vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t);
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, uregs, 0,
-                                VFP_STATE_SIZE - sizeof(compat_ulong_t));
+                                vregs_end_pos);
 
        if (count && !ret) {
-               ret = get_user(fpscr, (compat_ulong_t *)ubuf);
-               uregs->fpsr = fpscr & VFP_FPSCR_STAT_MASK;
-               uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpscr,
+                                        vregs_end_pos, VFP_STATE_SIZE);
+               if (!ret) {
+                       uregs->fpsr = fpscr & VFP_FPSCR_STAT_MASK;
+                       uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
+               }
        }
 
        fpsimd_flush_task_state(target);
index 2c822ef..d4b7405 100644 (file)
@@ -194,6 +194,9 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys)
        }
 
        name = of_flat_dt_get_machine_name();
+       if (!name)
+               return;
+
        pr_info("Machine model: %s\n", name);
        dump_stack_set_arch_desc("%s (DT)", name);
 }
index c7b6de6..089c374 100644 (file)
 
 #include <linux/compat.h>
 #include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/personality.h>
 #include <linux/freezer.h>
+#include <linux/stddef.h>
 #include <linux/uaccess.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
 #include <linux/tracehook.h>
 #include <linux/ratelimit.h>
 
 struct rt_sigframe {
        struct siginfo info;
        struct ucontext uc;
+};
+
+struct frame_record {
        u64 fp;
        u64 lr;
 };
 
+struct rt_sigframe_user_layout {
+       struct rt_sigframe __user *sigframe;
+       struct frame_record __user *next_frame;
+
+       unsigned long size;     /* size of allocated sigframe data */
+       unsigned long limit;    /* largest allowed size */
+
+       unsigned long fpsimd_offset;
+       unsigned long esr_offset;
+       unsigned long extra_offset;
+       unsigned long end_offset;
+};
+
+#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16)
+#define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16)
+#define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16)
+
+static void init_user_layout(struct rt_sigframe_user_layout *user)
+{
+       const size_t reserved_size =
+               sizeof(user->sigframe->uc.uc_mcontext.__reserved);
+
+       memset(user, 0, sizeof(*user));
+       user->size = offsetof(struct rt_sigframe, uc.uc_mcontext.__reserved);
+
+       user->limit = user->size + reserved_size;
+
+       user->limit -= TERMINATOR_SIZE;
+       user->limit -= EXTRA_CONTEXT_SIZE;
+       /* Reserve space for extension and terminator ^ */
+}
+
+static size_t sigframe_size(struct rt_sigframe_user_layout const *user)
+{
+       return round_up(max(user->size, sizeof(struct rt_sigframe)), 16);
+}
+
+/*
+ * Sanity limit on the approximate maximum size of signal frame we'll
+ * try to generate.  Stack alignment padding and the frame record are
+ * not taken into account.  This limit is not a guarantee and is
+ * NOT ABI.
+ */
+#define SIGFRAME_MAXSZ SZ_64K
+
+static int __sigframe_alloc(struct rt_sigframe_user_layout *user,
+                           unsigned long *offset, size_t size, bool extend)
+{
+       size_t padded_size = round_up(size, 16);
+
+       if (padded_size > user->limit - user->size &&
+           !user->extra_offset &&
+           extend) {
+               int ret;
+
+               user->limit += EXTRA_CONTEXT_SIZE;
+               ret = __sigframe_alloc(user, &user->extra_offset,
+                                      sizeof(struct extra_context), false);
+               if (ret) {
+                       user->limit -= EXTRA_CONTEXT_SIZE;
+                       return ret;
+               }
+
+               /* Reserve space for the __reserved[] terminator */
+               user->size += TERMINATOR_SIZE;
+
+               /*
+                * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for
+                * the terminator:
+                */
+               user->limit = SIGFRAME_MAXSZ - TERMINATOR_SIZE;
+       }
+
+       /* Still not enough space?  Bad luck! */
+       if (padded_size > user->limit - user->size)
+               return -ENOMEM;
+
+       *offset = user->size;
+       user->size += padded_size;
+
+       return 0;
+}
+
+/*
+ * Allocate space for an optional record of <size> bytes in the user
+ * signal frame.  The offset from the signal frame base address to the
+ * allocated block is assigned to *offset.
+ */
+static int sigframe_alloc(struct rt_sigframe_user_layout *user,
+                         unsigned long *offset, size_t size)
+{
+       return __sigframe_alloc(user, offset, size, true);
+}
+
+/* Allocate the null terminator record and prevent further allocations */
+static int sigframe_alloc_end(struct rt_sigframe_user_layout *user)
+{
+       int ret;
+
+       /* Un-reserve the space reserved for the terminator: */
+       user->limit += TERMINATOR_SIZE;
+
+       ret = sigframe_alloc(user, &user->end_offset,
+                            sizeof(struct _aarch64_ctx));
+       if (ret)
+               return ret;
+
+       /* Prevent further allocation: */
+       user->limit = user->size;
+       return 0;
+}
+
+static void __user *apply_user_offset(
+       struct rt_sigframe_user_layout const *user, unsigned long offset)
+{
+       char __user *base = (char __user *)user->sigframe;
+
+       return base + offset;
+}
+
 static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
 {
        struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
@@ -92,12 +219,159 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
        return err ? -EFAULT : 0;
 }
 
+struct user_ctxs {
+       struct fpsimd_context __user *fpsimd;
+};
+
+static int parse_user_sigframe(struct user_ctxs *user,
+                              struct rt_sigframe __user *sf)
+{
+       struct sigcontext __user *const sc = &sf->uc.uc_mcontext;
+       struct _aarch64_ctx __user *head;
+       char __user *base = (char __user *)&sc->__reserved;
+       size_t offset = 0;
+       size_t limit = sizeof(sc->__reserved);
+       bool have_extra_context = false;
+       char const __user *const sfp = (char const __user *)sf;
+
+       user->fpsimd = NULL;
+
+       if (!IS_ALIGNED((unsigned long)base, 16))
+               goto invalid;
+
+       while (1) {
+               int err = 0;
+               u32 magic, size;
+               char const __user *userp;
+               struct extra_context const __user *extra;
+               u64 extra_datap;
+               u32 extra_size;
+               struct _aarch64_ctx const __user *end;
+               u32 end_magic, end_size;
+
+               if (limit - offset < sizeof(*head))
+                       goto invalid;
+
+               if (!IS_ALIGNED(offset, 16))
+                       goto invalid;
+
+               head = (struct _aarch64_ctx __user *)(base + offset);
+               __get_user_error(magic, &head->magic, err);
+               __get_user_error(size, &head->size, err);
+               if (err)
+                       return err;
+
+               if (limit - offset < size)
+                       goto invalid;
+
+               switch (magic) {
+               case 0:
+                       if (size)
+                               goto invalid;
+
+                       goto done;
+
+               case FPSIMD_MAGIC:
+                       if (user->fpsimd)
+                               goto invalid;
+
+                       if (size < sizeof(*user->fpsimd))
+                               goto invalid;
+
+                       user->fpsimd = (struct fpsimd_context __user *)head;
+                       break;
+
+               case ESR_MAGIC:
+                       /* ignore */
+                       break;
+
+               case EXTRA_MAGIC:
+                       if (have_extra_context)
+                               goto invalid;
+
+                       if (size < sizeof(*extra))
+                               goto invalid;
+
+                       userp = (char const __user *)head;
+
+                       extra = (struct extra_context const __user *)userp;
+                       userp += size;
+
+                       __get_user_error(extra_datap, &extra->datap, err);
+                       __get_user_error(extra_size, &extra->size, err);
+                       if (err)
+                               return err;
+
+                       /* Check for the dummy terminator in __reserved[]: */
+
+                       if (limit - offset - size < TERMINATOR_SIZE)
+                               goto invalid;
+
+                       end = (struct _aarch64_ctx const __user *)userp;
+                       userp += TERMINATOR_SIZE;
+
+                       __get_user_error(end_magic, &end->magic, err);
+                       __get_user_error(end_size, &end->size, err);
+                       if (err)
+                               return err;
+
+                       if (end_magic || end_size)
+                               goto invalid;
+
+                       /* Prevent looping/repeated parsing of extra_context */
+                       have_extra_context = true;
+
+                       base = (__force void __user *)extra_datap;
+                       if (!IS_ALIGNED((unsigned long)base, 16))
+                               goto invalid;
+
+                       if (!IS_ALIGNED(extra_size, 16))
+                               goto invalid;
+
+                       if (base != userp)
+                               goto invalid;
+
+                       /* Reject "unreasonably large" frames: */
+                       if (extra_size > sfp + SIGFRAME_MAXSZ - userp)
+                               goto invalid;
+
+                       /*
+                        * Ignore trailing terminator in __reserved[]
+                        * and start parsing extra data:
+                        */
+                       offset = 0;
+                       limit = extra_size;
+                       continue;
+
+               default:
+                       goto invalid;
+               }
+
+               if (size < sizeof(*head))
+                       goto invalid;
+
+               if (limit - offset < size)
+                       goto invalid;
+
+               offset += size;
+       }
+
+done:
+       if (!user->fpsimd)
+               goto invalid;
+
+       return 0;
+
+invalid:
+       return -EINVAL;
+}
+
 static int restore_sigframe(struct pt_regs *regs,
                            struct rt_sigframe __user *sf)
 {
        sigset_t set;
        int i, err;
-       void *aux = sf->uc.uc_mcontext.__reserved;
+       struct user_ctxs user;
 
        err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
        if (err == 0)
@@ -116,12 +390,11 @@ static int restore_sigframe(struct pt_regs *regs,
        regs->syscallno = ~0UL;
 
        err |= !valid_user_regs(&regs->user_regs, current);
+       if (err == 0)
+               err = parse_user_sigframe(&user, sf);
 
-       if (err == 0) {
-               struct fpsimd_context *fpsimd_ctx =
-                       container_of(aux, struct fpsimd_context, head);
-               err |= restore_fpsimd_context(fpsimd_ctx);
-       }
+       if (err == 0)
+               err = restore_fpsimd_context(user.fpsimd);
 
        return err;
 }
@@ -162,16 +435,37 @@ badframe:
        return 0;
 }
 
-static int setup_sigframe(struct rt_sigframe __user *sf,
+/* Determine the layout of optional records in the signal frame */
+static int setup_sigframe_layout(struct rt_sigframe_user_layout *user)
+{
+       int err;
+
+       err = sigframe_alloc(user, &user->fpsimd_offset,
+                            sizeof(struct fpsimd_context));
+       if (err)
+               return err;
+
+       /* fault information, if valid */
+       if (current->thread.fault_code) {
+               err = sigframe_alloc(user, &user->esr_offset,
+                                    sizeof(struct esr_context));
+               if (err)
+                       return err;
+       }
+
+       return sigframe_alloc_end(user);
+}
+
+
+static int setup_sigframe(struct rt_sigframe_user_layout *user,
                          struct pt_regs *regs, sigset_t *set)
 {
        int i, err = 0;
-       void *aux = sf->uc.uc_mcontext.__reserved;
-       struct _aarch64_ctx *end;
+       struct rt_sigframe __user *sf = user->sigframe;
 
        /* set up the stack frame for unwinding */
-       __put_user_error(regs->regs[29], &sf->fp, err);
-       __put_user_error(regs->regs[30], &sf->lr, err);
+       __put_user_error(regs->regs[29], &user->next_frame->fp, err);
+       __put_user_error(regs->regs[30], &user->next_frame->lr, err);
 
        for (i = 0; i < 31; i++)
                __put_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i],
@@ -185,58 +479,103 @@ static int setup_sigframe(struct rt_sigframe __user *sf,
        err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
        if (err == 0) {
-               struct fpsimd_context *fpsimd_ctx =
-                       container_of(aux, struct fpsimd_context, head);
+               struct fpsimd_context __user *fpsimd_ctx =
+                       apply_user_offset(user, user->fpsimd_offset);
                err |= preserve_fpsimd_context(fpsimd_ctx);
-               aux += sizeof(*fpsimd_ctx);
        }
 
        /* fault information, if valid */
-       if (current->thread.fault_code) {
-               struct esr_context *esr_ctx =
-                       container_of(aux, struct esr_context, head);
+       if (err == 0 && user->esr_offset) {
+               struct esr_context __user *esr_ctx =
+                       apply_user_offset(user, user->esr_offset);
+
                __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
                __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
                __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
-               aux += sizeof(*esr_ctx);
+       }
+
+       if (err == 0 && user->extra_offset) {
+               char __user *sfp = (char __user *)user->sigframe;
+               char __user *userp =
+                       apply_user_offset(user, user->extra_offset);
+
+               struct extra_context __user *extra;
+               struct _aarch64_ctx __user *end;
+               u64 extra_datap;
+               u32 extra_size;
+
+               extra = (struct extra_context __user *)userp;
+               userp += EXTRA_CONTEXT_SIZE;
+
+               end = (struct _aarch64_ctx __user *)userp;
+               userp += TERMINATOR_SIZE;
+
+               /*
+                * extra_datap is just written to the signal frame.
+                * The value gets cast back to a void __user *
+                * during sigreturn.
+                */
+               extra_datap = (__force u64)userp;
+               extra_size = sfp + round_up(user->size, 16) - userp;
+
+               __put_user_error(EXTRA_MAGIC, &extra->head.magic, err);
+               __put_user_error(EXTRA_CONTEXT_SIZE, &extra->head.size, err);
+               __put_user_error(extra_datap, &extra->datap, err);
+               __put_user_error(extra_size, &extra->size, err);
+
+               /* Add the terminator */
+               __put_user_error(0, &end->magic, err);
+               __put_user_error(0, &end->size, err);
        }
 
        /* set the "end" magic */
-       end = aux;
-       __put_user_error(0, &end->magic, err);
-       __put_user_error(0, &end->size, err);
+       if (err == 0) {
+               struct _aarch64_ctx __user *end =
+                       apply_user_offset(user, user->end_offset);
+
+               __put_user_error(0, &end->magic, err);
+               __put_user_error(0, &end->size, err);
+       }
 
        return err;
 }
 
-static struct rt_sigframe __user *get_sigframe(struct ksignal *ksig,
-                                              struct pt_regs *regs)
+static int get_sigframe(struct rt_sigframe_user_layout *user,
+                        struct ksignal *ksig, struct pt_regs *regs)
 {
        unsigned long sp, sp_top;
-       struct rt_sigframe __user *frame;
+       int err;
+
+       init_user_layout(user);
+       err = setup_sigframe_layout(user);
+       if (err)
+               return err;
 
        sp = sp_top = sigsp(regs->sp, ksig);
 
-       sp = (sp - sizeof(struct rt_sigframe)) & ~15;
-       frame = (struct rt_sigframe __user *)sp;
+       sp = round_down(sp - sizeof(struct frame_record), 16);
+       user->next_frame = (struct frame_record __user *)sp;
+
+       sp = round_down(sp, 16) - sigframe_size(user);
+       user->sigframe = (struct rt_sigframe __user *)sp;
 
        /*
         * Check that we can actually write to the signal frame.
         */
-       if (!access_ok(VERIFY_WRITE, frame, sp_top - sp))
-               frame = NULL;
+       if (!access_ok(VERIFY_WRITE, user->sigframe, sp_top - sp))
+               return -EFAULT;
 
-       return frame;
+       return 0;
 }
 
 static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
-                        void __user *frame, int usig)
+                        struct rt_sigframe_user_layout *user, int usig)
 {
        __sigrestore_t sigtramp;
 
        regs->regs[0] = usig;
-       regs->sp = (unsigned long)frame;
-       regs->regs[29] = regs->sp + offsetof(struct rt_sigframe, fp);
+       regs->sp = (unsigned long)user->sigframe;
+       regs->regs[29] = (unsigned long)&user->next_frame->fp;
        regs->pc = (unsigned long)ka->sa.sa_handler;
 
        if (ka->sa.sa_flags & SA_RESTORER)
@@ -250,20 +589,22 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
 static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
                          struct pt_regs *regs)
 {
+       struct rt_sigframe_user_layout user;
        struct rt_sigframe __user *frame;
        int err = 0;
 
-       frame = get_sigframe(ksig, regs);
-       if (!frame)
+       if (get_sigframe(&user, ksig, regs))
                return 1;
 
+       frame = user.sigframe;
+
        __put_user_error(0, &frame->uc.uc_flags, err);
        __put_user_error(NULL, &frame->uc.uc_link, err);
 
        err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
-       err |= setup_sigframe(frame, regs, set);
+       err |= setup_sigframe(&user, regs, set);
        if (err == 0) {
-               setup_return(regs, &ksig->ka, frame, usig);
+               setup_return(regs, &ksig->ka, &user, usig);
                if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                        err |= copy_siginfo_to_user(&frame->info, &ksig->info);
                        regs->regs[1] = (unsigned long)&frame->info;
index feac80c..09d37d6 100644 (file)
@@ -210,6 +210,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 
        put_task_stack(tsk);
 }
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
 void save_stack_trace(struct stack_trace *trace)
 {
index 0805b44..c7c7088 100644 (file)
@@ -140,7 +140,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs)
        }
 }
 
-static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
+void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 {
        struct stackframe frame;
        unsigned long irq_stack_ptr;
@@ -344,22 +344,24 @@ static int call_undef_hook(struct pt_regs *regs)
 
        if (compat_thumb_mode(regs)) {
                /* 16-bit Thumb instruction */
-               if (get_user(instr, (u16 __user *)pc))
+               __le16 instr_le;
+               if (get_user(instr_le, (__le16 __user *)pc))
                        goto exit;
-               instr = le16_to_cpu(instr);
+               instr = le16_to_cpu(instr_le);
                if (aarch32_insn_is_wide(instr)) {
                        u32 instr2;
 
-                       if (get_user(instr2, (u16 __user *)(pc + 2)))
+                       if (get_user(instr_le, (__le16 __user *)(pc + 2)))
                                goto exit;
-                       instr2 = le16_to_cpu(instr2);
+                       instr2 = le16_to_cpu(instr_le);
                        instr = (instr << 16) | instr2;
                }
        } else {
                /* 32-bit ARM instruction */
-               if (get_user(instr, (u32 __user *)pc))
+               __le32 instr_le;
+               if (get_user(instr_le, (__le32 __user *)pc))
                        goto exit;
-               instr = le32_to_cpu(instr);
+               instr = le32_to_cpu(instr_le);
        }
 
        raw_spin_lock_irqsave(&undef_lock, flags);
@@ -728,8 +730,6 @@ static int bug_handler(struct pt_regs *regs, unsigned int esr)
                break;
 
        case BUG_TRAP_TYPE_WARN:
-               /* Ideally, report_bug() should backtrace for us... but no. */
-               dump_backtrace(regs, NULL);
                break;
 
        default:
index 7492d90..e8f759f 100644 (file)
@@ -37,7 +37,7 @@
 #include <asm/vdso.h>
 #include <asm/vdso_datapage.h>
 
-extern char vdso_start, vdso_end;
+extern char vdso_start[], vdso_end[];
 static unsigned long vdso_pages __ro_after_init;
 
 /*
@@ -125,14 +125,14 @@ static int __init vdso_init(void)
        struct page **vdso_pagelist;
        unsigned long pfn;
 
-       if (memcmp(&vdso_start, "\177ELF", 4)) {
+       if (memcmp(vdso_start, "\177ELF", 4)) {
                pr_err("vDSO is not a valid ELF object!\n");
                return -EINVAL;
        }
 
-       vdso_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT;
+       vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
        pr_info("vdso: %ld pages (%ld code @ %p, %ld data @ %p)\n",
-               vdso_pages + 1, vdso_pages, &vdso_start, 1L, vdso_data);
+               vdso_pages + 1, vdso_pages, vdso_start, 1L, vdso_data);
 
        /* Allocate the vDSO pagelist, plus a page for the data. */
        vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *),
@@ -145,7 +145,7 @@ static int __init vdso_init(void)
 
 
        /* Grab the vDSO code pages. */
-       pfn = sym_to_pfn(&vdso_start);
+       pfn = sym_to_pfn(vdso_start);
 
        for (i = 0; i < vdso_pages; i++)
                vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
index 3216e09..3e340b6 100644 (file)
@@ -95,11 +95,6 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
                                  dma_addr_t *dma_handle, gfp_t flags,
                                  unsigned long attrs)
 {
-       if (dev == NULL) {
-               WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
-               return NULL;
-       }
-
        if (IS_ENABLED(CONFIG_ZONE_DMA) &&
            dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                flags |= GFP_DMA;
@@ -128,10 +123,6 @@ static void __dma_free_coherent(struct device *dev, size_t size,
        bool freed;
        phys_addr_t paddr = dma_to_phys(dev, dma_handle);
 
-       if (dev == NULL) {
-               WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
-               return;
-       }
 
        freed = dma_release_from_contiguous(dev,
                                        phys_to_page(paddr),
index 37b95df..c7861c9 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
+#include <linux/hugetlb.h>
 
 #include <asm/bug.h>
 #include <asm/cpufeature.h>
@@ -42,6 +43,8 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
+#include <acpi/ghes.h>
+
 struct fault_info {
        int     (*fn)(unsigned long addr, unsigned int esr,
                      struct pt_regs *regs);
@@ -80,18 +83,35 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
 #endif
 
 /*
- * Dump out the page tables associated with 'addr' in mm 'mm'.
+ * Dump out the page tables associated with 'addr' in the currently active mm.
  */
-void show_pte(struct mm_struct *mm, unsigned long addr)
+void show_pte(unsigned long addr)
 {
+       struct mm_struct *mm;
        pgd_t *pgd;
 
-       if (!mm)
+       if (addr < TASK_SIZE) {
+               /* TTBR0 */
+               mm = current->active_mm;
+               if (mm == &init_mm) {
+                       pr_alert("[%016lx] user address but active_mm is swapper\n",
+                                addr);
+                       return;
+               }
+       } else if (addr >= VA_START) {
+               /* TTBR1 */
                mm = &init_mm;
+       } else {
+               pr_alert("[%016lx] address between user and kernel address ranges\n",
+                        addr);
+               return;
+       }
 
-       pr_alert("pgd = %p\n", mm->pgd);
+       pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgd = %p\n",
+                mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
+                VA_BITS, mm->pgd);
        pgd = pgd_offset(mm, addr);
-       pr_alert("[%08lx] *pgd=%016llx", addr, pgd_val(*pgd));
+       pr_alert("[%016lx] *pgd=%016llx", addr, pgd_val(*pgd));
 
        do {
                pud_t *pud;
@@ -196,8 +216,8 @@ static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs,
 /*
  * The kernel tried to access some page that wasn't present.
  */
-static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
-                             unsigned int esr, struct pt_regs *regs)
+static void __do_kernel_fault(unsigned long addr, unsigned int esr,
+                             struct pt_regs *regs)
 {
        const char *msg;
 
@@ -227,7 +247,7 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
        pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg,
                 addr);
 
-       show_pte(mm, addr);
+       show_pte(addr);
        die("Oops", regs, esr);
        bust_spinlocks(0);
        do_exit(SIGKILL);
@@ -239,18 +259,20 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
  */
 static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
                            unsigned int esr, unsigned int sig, int code,
-                           struct pt_regs *regs)
+                           struct pt_regs *regs, int fault)
 {
        struct siginfo si;
        const struct fault_info *inf;
+       unsigned int lsb = 0;
 
        if (unhandled_signal(tsk, sig) && show_unhandled_signals_ratelimited()) {
                inf = esr_to_fault_info(esr);
-               pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n",
+               pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x",
                        tsk->comm, task_pid_nr(tsk), inf->name, sig,
                        addr, esr);
-               show_pte(tsk->mm, addr);
-               show_regs(regs);
+               print_vma_addr(KERN_CONT ", in ", regs->pc);
+               pr_cont("\n");
+               __show_regs(regs);
        }
 
        tsk->thread.fault_address = addr;
@@ -259,13 +281,23 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
        si.si_errno = 0;
        si.si_code = code;
        si.si_addr = (void __user *)addr;
+       /*
+        * Either small page or large page may be poisoned.
+        * In other words, VM_FAULT_HWPOISON_LARGE and
+        * VM_FAULT_HWPOISON are mutually exclusive.
+        */
+       if (fault & VM_FAULT_HWPOISON_LARGE)
+               lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+       else if (fault & VM_FAULT_HWPOISON)
+               lsb = PAGE_SHIFT;
+       si.si_addr_lsb = lsb;
+
        force_sig_info(sig, &si, tsk);
 }
 
 static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
        struct task_struct *tsk = current;
-       struct mm_struct *mm = tsk->active_mm;
        const struct fault_info *inf;
 
        /*
@@ -274,9 +306,9 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
         */
        if (user_mode(regs)) {
                inf = esr_to_fault_info(esr);
-               __do_user_fault(tsk, addr, esr, inf->sig, inf->code, regs);
+               __do_user_fault(tsk, addr, esr, inf->sig, inf->code, regs, 0);
        } else
-               __do_kernel_fault(mm, addr, esr, regs);
+               __do_kernel_fault(addr, esr, regs);
 }
 
 #define VM_FAULT_BADMAP                0x010000
@@ -329,7 +361,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 {
        struct task_struct *tsk;
        struct mm_struct *mm;
-       int fault, sig, code;
+       int fault, sig, code, major = 0;
        unsigned long vm_flags = VM_READ | VM_WRITE;
        unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
@@ -368,6 +400,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                        die("Accessing user space memory outside uaccess.h routines", regs, esr);
        }
 
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
        /*
         * As per x86, we may deadlock here. However, since the kernel only
         * validly references user space from well defined areas of the code,
@@ -391,24 +425,42 @@ retry:
        }
 
        fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
+       major |= fault & VM_FAULT_MAJOR;
 
-       /*
-        * If we need to retry but a fatal signal is pending, handle the
-        * signal first. We do not need to release the mmap_sem because it
-        * would already be released in __lock_page_or_retry in mm/filemap.c.
-        */
-       if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
-               return 0;
+       if (fault & VM_FAULT_RETRY) {
+               /*
+                * If we need to retry but a fatal signal is pending,
+                * handle the signal first. We do not need to release
+                * the mmap_sem because it would already be released
+                * in __lock_page_or_retry in mm/filemap.c.
+                */
+               if (fatal_signal_pending(current))
+                       return 0;
+
+               /*
+                * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
+                * starvation.
+                */
+               if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
+                       mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       mm_flags |= FAULT_FLAG_TRIED;
+                       goto retry;
+               }
+       }
+       up_read(&mm->mmap_sem);
 
        /*
-        * Major/minor page fault accounting is only done on the initial
-        * attempt. If we go through a retry, it is extremely likely that the
-        * page will be found in page cache at that point.
+        * Handle the "normal" (no error) case first.
         */
-
-       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
-       if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
-               if (fault & VM_FAULT_MAJOR) {
+       if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
+                             VM_FAULT_BADACCESS)))) {
+               /*
+                * Major/minor page fault accounting is only done
+                * once. If we go through a retry, it is extremely
+                * likely that the page will be found in page cache at
+                * that point.
+                */
+               if (major) {
                        tsk->maj_flt++;
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
                                      addr);
@@ -417,25 +469,9 @@ retry:
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
                                      addr);
                }
-               if (fault & VM_FAULT_RETRY) {
-                       /*
-                        * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
-                        * starvation.
-                        */
-                       mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
-                       mm_flags |= FAULT_FLAG_TRIED;
-                       goto retry;
-               }
-       }
-
-       up_read(&mm->mmap_sem);
 
-       /*
-        * Handle the "normal" case first - VM_FAULT_MAJOR
-        */
-       if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
-                             VM_FAULT_BADACCESS))))
                return 0;
+       }
 
        /*
         * If we are in kernel mode at this point, we have no context to
@@ -461,6 +497,9 @@ retry:
                 */
                sig = SIGBUS;
                code = BUS_ADRERR;
+       } else if (fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
+               sig = SIGBUS;
+               code = BUS_MCEERR_AR;
        } else {
                /*
                 * Something tried to access memory that isn't in our memory
@@ -471,11 +510,11 @@ retry:
                        SEGV_ACCERR : SEGV_MAPERR;
        }
 
-       __do_user_fault(tsk, addr, esr, sig, code, regs);
+       __do_user_fault(tsk, addr, esr, sig, code, regs, fault);
        return 0;
 
 no_context:
-       __do_kernel_fault(mm, addr, esr, regs);
+       __do_kernel_fault(addr, esr, regs);
        return 0;
 }
 
@@ -522,6 +561,47 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
        return 1;
 }
 
+/*
+ * This abort handler deals with Synchronous External Abort.
+ * It calls notifiers, and then returns "fault".
+ */
+static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+{
+       struct siginfo info;
+       const struct fault_info *inf;
+       int ret = 0;
+
+       inf = esr_to_fault_info(esr);
+       pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
+               inf->name, esr, addr);
+
+       /*
+        * Synchronous aborts may interrupt code which had interrupts masked.
+        * Before calling out into the wider kernel tell the interested
+        * subsystems.
+        */
+       if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) {
+               if (interrupts_enabled(regs))
+                       nmi_enter();
+
+               ret = ghes_notify_sea();
+
+               if (interrupts_enabled(regs))
+                       nmi_exit();
+       }
+
+       info.si_signo = SIGBUS;
+       info.si_errno = 0;
+       info.si_code  = 0;
+       if (esr & ESR_ELx_FnV)
+               info.si_addr = NULL;
+       else
+               info.si_addr  = (void __user *)addr;
+       arm64_notify_die("", regs, &info, esr);
+
+       return ret;
+}
+
 static const struct fault_info fault_info[] = {
        { do_bad,               SIGBUS,  0,             "ttbr address size fault"       },
        { do_bad,               SIGBUS,  0,             "level 1 address size fault"    },
@@ -539,22 +619,22 @@ static const struct fault_info fault_info[] = {
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,   "level 1 permission fault"      },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,   "level 2 permission fault"      },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,   "level 3 permission fault"      },
-       { do_bad,               SIGBUS,  0,             "synchronous external abort"    },
+       { do_sea,               SIGBUS,  0,             "synchronous external abort"    },
        { do_bad,               SIGBUS,  0,             "unknown 17"                    },
        { do_bad,               SIGBUS,  0,             "unknown 18"                    },
        { do_bad,               SIGBUS,  0,             "unknown 19"                    },
-       { do_bad,               SIGBUS,  0,             "synchronous external abort (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous external abort (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous external abort (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous external abort (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous parity error"      },
+       { do_sea,               SIGBUS,  0,             "level 0 (translation table walk)"      },
+       { do_sea,               SIGBUS,  0,             "level 1 (translation table walk)"      },
+       { do_sea,               SIGBUS,  0,             "level 2 (translation table walk)"      },
+       { do_sea,               SIGBUS,  0,             "level 3 (translation table walk)"      },
+       { do_sea,               SIGBUS,  0,             "synchronous parity or ECC error" },
        { do_bad,               SIGBUS,  0,             "unknown 25"                    },
        { do_bad,               SIGBUS,  0,             "unknown 26"                    },
        { do_bad,               SIGBUS,  0,             "unknown 27"                    },
-       { do_bad,               SIGBUS,  0,             "synchronous parity error (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous parity error (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous parity error (translation table walk)" },
-       { do_bad,               SIGBUS,  0,             "synchronous parity error (translation table walk)" },
+       { do_sea,               SIGBUS,  0,             "level 0 synchronous parity error (translation table walk)"     },
+       { do_sea,               SIGBUS,  0,             "level 1 synchronous parity error (translation table walk)"     },
+       { do_sea,               SIGBUS,  0,             "level 2 synchronous parity error (translation table walk)"     },
+       { do_sea,               SIGBUS,  0,             "level 3 synchronous parity error (translation table walk)"     },
        { do_bad,               SIGBUS,  0,             "unknown 32"                    },
        { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"               },
        { do_bad,               SIGBUS,  0,             "unknown 34"                    },
@@ -589,6 +669,23 @@ static const struct fault_info fault_info[] = {
        { do_bad,               SIGBUS,  0,             "unknown 63"                    },
 };
 
+/*
+ * Handle Synchronous External Aborts that occur in a guest kernel.
+ *
+ * The return value will be zero if the SEA was successfully handled
+ * and non-zero if processing the error record failed or there was
+ * no error to process.
+ */
+int handle_guest_sea(phys_addr_t addr, unsigned int esr)
+{
+       int ret = -ENOENT;
+
+       if (IS_ENABLED(CONFIG_ACPI_APEI_SEA))
+               ret = ghes_notify_sea();
+
+       return ret;
+}
+
 /*
  * Dispatch a data abort to the relevant handler.
  */
index 7514a00..69b8200 100644 (file)
@@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
-       pmd_t *pmd = NULL;
-       pte_t *pte = NULL;
+       pmd_t *pmd;
 
        pgd = pgd_offset(mm, addr);
        pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
        if (!pgd_present(*pgd))
                return NULL;
+
        pud = pud_offset(pgd, addr);
-       if (!pud_present(*pud))
+       if (pud_none(*pud))
                return NULL;
-
-       if (pud_huge(*pud))
+       /* swap or huge page */
+       if (!pud_present(*pud) || pud_huge(*pud))
                return (pte_t *)pud;
+       /* table; check the next level */
+
        pmd = pmd_offset(pud, addr);
-       if (!pmd_present(*pmd))
+       if (pmd_none(*pmd))
                return NULL;
-
-       if (pte_cont(pmd_pte(*pmd))) {
-               pmd = pmd_offset(
-                       pud, (addr & CONT_PMD_MASK));
-               return (pte_t *)pmd;
-       }
-       if (pmd_huge(*pmd))
+       if (!pmd_present(*pmd) || pmd_huge(*pmd))
                return (pte_t *)pmd;
-       pte = pte_offset_kernel(pmd, addr);
-       if (pte_present(*pte) && pte_cont(*pte)) {
-               pte = pte_offset_kernel(
-                       pmd, (addr & CONT_PTE_MASK));
-               return pte;
-       }
+
        return NULL;
 }
 
index 7b0d557..adc208c 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <linux/elf.h>
 #include <linux/fs.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/export.h>
@@ -103,12 +104,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
  */
 int valid_phys_addr_range(phys_addr_t addr, size_t size)
 {
-       if (addr < PHYS_OFFSET)
-               return 0;
-       if (addr + size > __pa(high_memory - 1) + 1)
-               return 0;
-
-       return 1;
+       /*
+        * Check whether addr is covered by a memory region without the
+        * MEMBLOCK_NOMAP attribute, and whether that region covers the
+        * entire range. In theory, this could lead to false negatives
+        * if the range is covered by distinct but adjacent memory regions
+        * that only differ in other attributes. However, few such
+        * attributes have been defined, and it is debatable whether it
+        * follows that /dev/mem read() calls should be able to traverse
+        * such boundaries.
+        */
+       return memblock_is_region_memory(addr, size) &&
+              memblock_is_map_memory(addr);
 }
 
 /*
index 0c429ec..23c2d89 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/fs.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
index 2f0505b..f32144b 100644 (file)
@@ -70,7 +70,7 @@ struct jit_ctx {
        int idx;
        int epilogue_offset;
        int *offset;
-       u32 *image;
+       __le32 *image;
        u32 stack_size;
 };
 
@@ -131,7 +131,7 @@ static inline int bpf2a64_offset(int bpf_to, int bpf_from,
 
 static void jit_fill_hole(void *area, unsigned int size)
 {
-       u32 *ptr;
+       __le32 *ptr;
        /* We are guaranteed to have aligned memory. */
        for (ptr = area; size >= sizeof(u32); size -= sizeof(u32))
                *ptr++ = cpu_to_le32(AARCH64_BREAK_FAULT);
@@ -874,7 +874,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
        /* 2. Now, the actual pass. */
 
-       ctx.image = (u32 *)image_ptr;
+       ctx.image = (__le32 *)image_ptr;
        ctx.idx = 0;
 
        build_prologue(&ctx);
index b0140c8..de14d49 100644 (file)
@@ -39,6 +39,21 @@ config ACPI_APEI_PCIEAER
          PCIe AER errors may be reported via APEI firmware first mode.
          Turn on this option to enable the corresponding support.
 
+config ACPI_APEI_SEA
+       bool "APEI Synchronous External Abort logging/recovering support"
+       depends on ARM64 && ACPI_APEI_GHES
+       default y
+       help
+         This option should be enabled if the system supports
+         firmware first handling of SEA (Synchronous External Abort).
+         SEA happens with certain faults of data abort or instruction
+         abort synchronous exceptions on ARMv8 systems. If a system
+         supports firmware first handling of SEA, the platform analyzes
+         and handles hardware error notifications from SEA, and it may then
+         form a HW error record for the OS to parse and handle. This
+         option allows the OS to look for such a hardware error record, and
+         take appropriate action.
+
 config ACPI_APEI_MEMORY_FAILURE
        bool "APEI memory error recovering support"
        depends on ACPI_APEI && MEMORY_FAILURE
index 0968816..d661d45 100644 (file)
 #include <linux/aer.h>
 #include <linux/nmi.h>
 #include <linux/sched/clock.h>
+#include <linux/uuid.h>
+#include <linux/ras.h>
 
+#include <acpi/actbl1.h>
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
 #include <asm/tlbflush.h>
+#include <ras/ras_event.h>
 
 #include "apei-internal.h"
 
        ((struct acpi_hest_generic_status *)                            \
         ((struct ghes_estatus_node *)(estatus_node) + 1))
 
+static inline bool is_hest_type_generic_v2(struct ghes *ghes)
+{
+       return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
+}
+
 /*
  * This driver isn't really modular, however for the time being,
  * continuing to use module_param is the easiest way to remain
@@ -110,11 +119,7 @@ static DEFINE_MUTEX(ghes_list_mutex);
  * Two virtual pages are used, one for IRQ/PROCESS context, the other for
  * NMI context (optionally).
  */
-#ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #define GHES_IOREMAP_PAGES           2
-#else
-#define GHES_IOREMAP_PAGES           1
-#endif
 #define GHES_IOREMAP_IRQ_PAGE(base)    (base)
 #define GHES_IOREMAP_NMI_PAGE(base)    ((base) + PAGE_SIZE)
 
@@ -133,6 +138,8 @@ static unsigned long ghes_estatus_pool_size_request;
 static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
 static atomic_t ghes_estatus_cache_alloced;
 
+static int ghes_panic_timeout __read_mostly = 30;
+
 static int ghes_ioremap_init(void)
 {
        ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
@@ -153,10 +160,14 @@ static void ghes_ioremap_exit(void)
 static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
 {
        unsigned long vaddr;
+       phys_addr_t paddr;
+       pgprot_t prot;
 
        vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
-       ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
-                          pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+       paddr = pfn << PAGE_SHIFT;
+       prot = arch_apei_get_mem_attribute(paddr);
+       ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot);
 
        return (void __iomem *)vaddr;
 }
@@ -240,6 +251,16 @@ static int ghes_estatus_pool_expand(unsigned long len)
        return 0;
 }
 
+static int map_gen_v2(struct ghes *ghes)
+{
+       return apei_map_generic_address(&ghes->generic_v2->read_ack_register);
+}
+
+static void unmap_gen_v2(struct ghes *ghes)
+{
+       apei_unmap_generic_address(&ghes->generic_v2->read_ack_register);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
        struct ghes *ghes;
@@ -249,10 +270,17 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
        ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
        if (!ghes)
                return ERR_PTR(-ENOMEM);
+
        ghes->generic = generic;
+       if (is_hest_type_generic_v2(ghes)) {
+               rc = map_gen_v2(ghes);
+               if (rc)
+                       goto err_free;
+       }
+
        rc = apei_map_generic_address(&generic->error_status_address);
        if (rc)
-               goto err_free;
+               goto err_unmap_read_ack_addr;
        error_block_length = generic->error_block_length;
        if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
                pr_warning(FW_WARN GHES_PFX
@@ -264,13 +292,16 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
        ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
        if (!ghes->estatus) {
                rc = -ENOMEM;
-               goto err_unmap;
+               goto err_unmap_status_addr;
        }
 
        return ghes;
 
-err_unmap:
+err_unmap_status_addr:
        apei_unmap_generic_address(&generic->error_status_address);
+err_unmap_read_ack_addr:
+       if (is_hest_type_generic_v2(ghes))
+               unmap_gen_v2(ghes);
 err_free:
        kfree(ghes);
        return ERR_PTR(rc);
@@ -280,6 +311,8 @@ static void ghes_fini(struct ghes *ghes)
 {
        kfree(ghes->estatus);
        apei_unmap_generic_address(&ghes->generic->error_status_address);
+       if (is_hest_type_generic_v2(ghes))
+               unmap_gen_v2(ghes);
 }
 
 static inline int ghes_severity(int severity)
@@ -400,8 +433,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
        unsigned long pfn;
        int flags = -1;
        int sec_sev = ghes_severity(gdata->error_severity);
-       struct cper_sec_mem_err *mem_err;
-       mem_err = (struct cper_sec_mem_err *)(gdata + 1);
+       struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
        if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
                return;
@@ -432,14 +464,22 @@ static void ghes_do_proc(struct ghes *ghes,
        int sev, sec_sev;
        struct acpi_hest_generic_data *gdata;
        guid_t *sec_type;
+       guid_t *fru_id = &NULL_UUID_LE;
+       char *fru_text = "";
 
        sev = ghes_severity(estatus->error_severity);
        apei_estatus_for_each_section(estatus, gdata) {
                sec_type = (guid_t *)gdata->section_type;
                sec_sev = ghes_severity(gdata->error_severity);
+               if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+                       fru_id = (guid_t *)gdata->fru_id;
+
+               if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+                       fru_text = gdata->fru_text;
+
                if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
-                       struct cper_sec_mem_err *mem_err;
-                       mem_err = (struct cper_sec_mem_err *)(gdata+1);
+                       struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
                        ghes_edac_report_mem_error(ghes, sev, mem_err);
 
                        arch_apei_report_mem_error(sev, mem_err);
@@ -447,8 +487,8 @@ static void ghes_do_proc(struct ghes *ghes,
                }
 #ifdef CONFIG_ACPI_APEI_PCIEAER
                else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
-                       struct cper_sec_pcie *pcie_err;
-                       pcie_err = (struct cper_sec_pcie *)(gdata+1);
+                       struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
                        if (sev == GHES_SEV_RECOVERABLE &&
                            sec_sev == GHES_SEV_RECOVERABLE &&
                            pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
@@ -477,6 +517,17 @@ static void ghes_do_proc(struct ghes *ghes,
 
                }
 #endif
+               else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
+                       struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+
+                       log_arm_hw_error(err);
+               } else {
+                       void *err = acpi_hest_get_payload(gdata);
+
+                       log_non_standard_event(sec_type, fru_id, fru_text,
+                                              sec_sev, err,
+                                              gdata->error_data_length);
+               }
        }
 }
 
@@ -649,6 +700,31 @@ static void ghes_estatus_cache_add(
        rcu_read_unlock();
 }
 
+static int ghes_ack_error(struct acpi_hest_generic_v2 *gv2)
+{
+       int rc;
+       u64 val = 0;
+
+       rc = apei_read(&val, &gv2->read_ack_register);
+       if (rc)
+               return rc;
+
+       val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset;
+       val |= gv2->read_ack_write    << gv2->read_ack_register.bit_offset;
+
+       return apei_write(val, &gv2->read_ack_register);
+}
+
+static void __ghes_panic(struct ghes *ghes)
+{
+       __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
+
+       /* reboot to log the error! */
+       if (!panic_timeout)
+               panic_timeout = ghes_panic_timeout;
+       panic("Fatal hardware error!");
+}
+
 static int ghes_proc(struct ghes *ghes)
 {
        int rc;
@@ -656,11 +732,26 @@ static int ghes_proc(struct ghes *ghes)
        rc = ghes_read_estatus(ghes, 0);
        if (rc)
                goto out;
+
+       if (ghes_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC) {
+               __ghes_panic(ghes);
+       }
+
        if (!ghes_estatus_cached(ghes->estatus)) {
                if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
                        ghes_estatus_cache_add(ghes->generic, ghes->estatus);
        }
        ghes_do_proc(ghes, ghes->estatus);
+
+       /*
+        * GHESv2 type HEST entries introduce support for error acknowledgment,
+        * so only acknowledge the error if this support is present.
+        */
+       if (is_hest_type_generic_v2(ghes)) {
+               rc = ghes_ack_error(ghes->generic_v2);
+               if (rc)
+                       return rc;
+       }
 out:
        ghes_clear_estatus(ghes);
        return rc;
@@ -722,6 +813,55 @@ static struct notifier_block ghes_notifier_hed = {
        .notifier_call = ghes_notify_hed,
 };
 
+#ifdef CONFIG_ACPI_APEI_SEA
+static LIST_HEAD(ghes_sea);
+
+/*
+ * Return 0 only if one of the SEA error sources successfully reported an error
+ * record sent from the firmware.
+ */
+int ghes_notify_sea(void)
+{
+       struct ghes *ghes;
+       int ret = -ENOENT;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(ghes, &ghes_sea, list) {
+               if (!ghes_proc(ghes))
+                       ret = 0;
+       }
+       rcu_read_unlock();
+       return ret;
+}
+
+static void ghes_sea_add(struct ghes *ghes)
+{
+       mutex_lock(&ghes_list_mutex);
+       list_add_rcu(&ghes->list, &ghes_sea);
+       mutex_unlock(&ghes_list_mutex);
+}
+
+static void ghes_sea_remove(struct ghes *ghes)
+{
+       mutex_lock(&ghes_list_mutex);
+       list_del_rcu(&ghes->list);
+       mutex_unlock(&ghes_list_mutex);
+       synchronize_rcu();
+}
+#else /* CONFIG_ACPI_APEI_SEA */
+static inline void ghes_sea_add(struct ghes *ghes)
+{
+       pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n",
+              ghes->generic->header.source_id);
+}
+
+static inline void ghes_sea_remove(struct ghes *ghes)
+{
+       pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n",
+              ghes->generic->header.source_id);
+}
+#endif /* CONFIG_ACPI_APEI_SEA */
+
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 /*
  * printk is not safe in NMI context.  So in NMI handler, we allocate
@@ -742,8 +882,6 @@ static atomic_t ghes_in_nmi = ATOMIC_INIT(0);
 
 static LIST_HEAD(ghes_nmi);
 
-static int ghes_panic_timeout  __read_mostly = 30;
-
 static void ghes_proc_in_irq(struct irq_work *irq_work)
 {
        struct llist_node *llnode, *next;
@@ -829,18 +967,6 @@ static void __process_error(struct ghes *ghes)
 #endif
 }
 
-static void __ghes_panic(struct ghes *ghes)
-{
-       oops_begin();
-       ghes_print_queued_estatus();
-       __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
-
-       /* reboot to log the error! */
-       if (panic_timeout == 0)
-               panic_timeout = ghes_panic_timeout;
-       panic("Fatal hardware error!");
-}
-
 static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
        struct ghes *ghes;
@@ -858,8 +984,11 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
                }
 
                sev = ghes_severity(ghes->estatus->error_severity);
-               if (sev >= GHES_SEV_PANIC)
+               if (sev >= GHES_SEV_PANIC) {
+                       oops_begin();
+                       ghes_print_queued_estatus();
                        __ghes_panic(ghes);
+               }
 
                if (!(ghes->flags & GHES_TO_CLEAR))
                        continue;
@@ -970,6 +1099,14 @@ static int ghes_probe(struct platform_device *ghes_dev)
        case ACPI_HEST_NOTIFY_GPIO:
                break;
 
+       case ACPI_HEST_NOTIFY_SEA:
+               if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA)) {
+                       pr_warn(GHES_PFX "Generic hardware error source: %d notified via SEA is not supported\n",
+                               generic->header.source_id);
+                       rc = -ENOTSUPP;
+                       goto err;
+               }
+               break;
        case ACPI_HEST_NOTIFY_NMI:
                if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
                        pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1038,6 +1175,9 @@ static int ghes_probe(struct platform_device *ghes_dev)
                mutex_unlock(&ghes_list_mutex);
                break;
 
+       case ACPI_HEST_NOTIFY_SEA:
+               ghes_sea_add(ghes);
+               break;
        case ACPI_HEST_NOTIFY_NMI:
                ghes_nmi_add(ghes);
                break;
@@ -1046,6 +1186,9 @@ static int ghes_probe(struct platform_device *ghes_dev)
        }
        platform_set_drvdata(ghes_dev, ghes);
 
+       /* Handle any pending errors right away */
+       ghes_proc(ghes);
+
        return 0;
 err_edac_unreg:
        ghes_edac_unregister(ghes);
@@ -1085,6 +1228,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
                synchronize_rcu();
                break;
 
+       case ACPI_HEST_NOTIFY_SEA:
+               ghes_sea_remove(ghes);
+               break;
        case ACPI_HEST_NOTIFY_NMI:
                ghes_nmi_remove(ghes);
                break;
index 8f2a98e..456b488 100644 (file)
@@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
        [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
        [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
        [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+       [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
 };
 
 static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
@@ -141,7 +142,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
 {
        int *count = data;
 
-       if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR)
+       if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+           hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
                (*count)++;
        return 0;
 }
@@ -152,7 +154,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
        struct ghes_arr *ghes_arr = data;
        int rc, i;
 
-       if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
+       if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
+           hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
                return 0;
 
        if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
index 797b28d..d048f72 100644 (file)
@@ -234,21 +234,6 @@ static struct acpi_iort_node *iort_scan_node(enum acpi_iort_node_type type,
        return NULL;
 }
 
-static acpi_status
-iort_match_type_callback(struct acpi_iort_node *node, void *context)
-{
-       return AE_OK;
-}
-
-bool iort_node_match(u8 type)
-{
-       struct acpi_iort_node *node;
-
-       node = iort_scan_node(type, iort_match_type_callback, NULL);
-
-       return node != NULL;
-}
-
 static acpi_status iort_match_node_callback(struct acpi_iort_node *node,
                                            void *context)
 {
index 2af7001..ccd239a 100644 (file)
@@ -17,6 +17,8 @@ config DEVMEM
 
 config DEVKMEM
        bool "/dev/kmem virtual device support"
+       # On arm64, VMALLOC_START < PAGE_OFFSET, which confuses kmem read/write
+       depends on !ARM64
        help
          Say Y here if you want to support the /dev/kmem device. The
          /dev/kmem device is rarely used, but can be used for certain
index d425374..48a8f69 100644 (file)
 #include <linux/acpi.h>
 #include <linux/pci.h>
 #include <linux/aer.h>
+#include <linux/printk.h>
+#include <linux/bcd.h>
+#include <acpi/ghes.h>
+#include <ras/ras_event.h>
 
 #define INDENT_SP      " "
 
@@ -107,12 +111,15 @@ void cper_print_bits(const char *pfx, unsigned int bits,
 static const char * const proc_type_strs[] = {
        "IA32/X64",
        "IA64",
+       "ARM",
 };
 
 static const char * const proc_isa_strs[] = {
        "IA32",
        "IA64",
        "X64",
+       "ARM A32/T32",
+       "ARM A64",
 };
 
 static const char * const proc_error_type_strs[] = {
@@ -181,6 +188,122 @@ static void cper_print_proc_generic(const char *pfx,
                printk("%s""IP: 0x%016llx\n", pfx, proc->ip);
 }
 
+#if defined(CONFIG_ARM64) || defined(CONFIG_ARM)
+/*
+ * Human-readable names for ARM register context types.  The table is
+ * indexed directly by cper_arm_ctx_info.type in cper_print_proc_arm(),
+ * which rejects any type greater than the last index.
+ */
+static const char * const arm_reg_ctx_strs[] = {
+       "AArch32 general purpose registers",
+       "AArch32 EL1 context registers",
+       "AArch32 EL2 context registers",
+       "AArch32 secure context registers",
+       "AArch64 general purpose registers",
+       "AArch64 EL1 context registers",
+       "AArch64 EL2 context registers",
+       "AArch64 EL3 context registers",
+       "Misc. system register structure",
+};
+
+/*
+ * Print the contents of an ARM processor error section.
+ *
+ * @pfx:  string prepended to every output line
+ * @proc: fixed section header; the error-info array, the context-info
+ *        structures and any trailing vendor-specific bytes are read from
+ *        the memory that immediately follows it
+ *
+ * The counts and sizes used below come from the record itself, so they are
+ * checked against proc->section_length before the data they describe is
+ * dereferenced.  Printing stops at the first inconsistency.
+ */
+static void cper_print_proc_arm(const char *pfx,
+                               const struct cper_sec_proc_arm *proc)
+{
+       int i, len, max_ctx_type;
+       const struct cper_arm_err_info *err_info;
+       const struct cper_arm_ctx_info *ctx_info;
+       char newpfx[64];
+
+       printk("%sMIDR: 0x%016llx\n", pfx, proc->midr);
+
+       /* Bytes left once the header and the error-info array are consumed. */
+       len = proc->section_length - (sizeof(*proc) +
+               proc->err_info_num * (sizeof(*err_info)));
+       if (len < 0) {
+               printk("%ssection length: %d\n", pfx, proc->section_length);
+               printk("%ssection length is too small\n", pfx);
+               printk("%sfirmware-generated error record is incorrect\n", pfx);
+               printk("%sERR_INFO_NUM is %d\n", pfx, proc->err_info_num);
+               return;
+       }
+
+       if (proc->validation_bits & CPER_ARM_VALID_MPIDR)
+               printk("%sMultiprocessor Affinity Register (MPIDR): 0x%016llx\n",
+                       pfx, proc->mpidr);
+
+       if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL)
+               printk("%serror affinity level: %d\n", pfx,
+                       proc->affinity_level);
+
+       if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) {
+               printk("%srunning state: 0x%x\n", pfx, proc->running_state);
+               printk("%sPower State Coordination Interface state: %d\n",
+                       pfx, proc->psci_state);
+       }
+
+       /* Nested details are indented one level deeper than the caller's. */
+       snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+
+       /* The error-info array starts right after the fixed header. */
+       err_info = (const struct cper_arm_err_info *)(proc + 1);
+       for (i = 0; i < proc->err_info_num; i++) {
+               printk("%sError info structure %d:\n", pfx, i);
+
+               printk("%snum errors: %d\n", pfx, err_info->multiple_error + 1);
+
+               if (err_info->validation_bits & CPER_ARM_INFO_VALID_FLAGS) {
+                       if (err_info->flags & CPER_ARM_INFO_FLAGS_FIRST)
+                               printk("%sfirst error captured\n", newpfx);
+                       if (err_info->flags & CPER_ARM_INFO_FLAGS_LAST)
+                               printk("%slast error captured\n", newpfx);
+                       if (err_info->flags & CPER_ARM_INFO_FLAGS_PROPAGATED)
+                               printk("%spropagated error captured\n",
+                                      newpfx);
+                       if (err_info->flags & CPER_ARM_INFO_FLAGS_OVERFLOW)
+                               printk("%soverflow occurred, error info is incomplete\n",
+                                      newpfx);
+               }
+
+               printk("%serror_type: %d, %s\n", newpfx, err_info->type,
+                       err_info->type < ARRAY_SIZE(proc_error_type_strs) ?
+                       proc_error_type_strs[err_info->type] : "unknown");
+               if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
+                       printk("%serror_info: 0x%016llx\n", newpfx,
+                              err_info->error_info);
+               if (err_info->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
+                       printk("%svirtual fault address: 0x%016llx\n",
+                               newpfx, err_info->virt_fault_addr);
+               if (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
+                       printk("%sphysical fault address: 0x%016llx\n",
+                               newpfx, err_info->physical_fault_addr);
+               err_info += 1;
+       }
+
+       /* Context-info structures follow the last error-info entry. */
+       ctx_info = (const struct cper_arm_ctx_info *)err_info;
+       max_ctx_type = ARRAY_SIZE(arm_reg_ctx_strs) - 1;
+       for (i = 0; i < proc->context_info_num; i++) {
+               int size;
+
+               printk("%sContext info structure %d:\n", pfx, i);
+               /*
+                * Validate before trusting any field: the context header must
+                * fit in what is left, and the register array size it carries
+                * must not exceed the remainder.  The size is compared without
+                * being added to anything, so an oversized value cannot wrap
+                * the signed total and slip past the length check.
+                */
+               if (len < (int)sizeof(*ctx_info) ||
+                   ctx_info->size > len - sizeof(*ctx_info)) {
+                       printk("%ssection length is too small\n", newpfx);
+                       printk("%sfirmware-generated error record is incorrect\n", pfx);
+                       return;
+               }
+               size = sizeof(*ctx_info) + ctx_info->size;
+
+               if (ctx_info->type > max_ctx_type) {
+                       printk("%sInvalid context type: %d (max: %d)\n",
+                               newpfx, ctx_info->type, max_ctx_type);
+                       return;
+               }
+               printk("%sregister context type: %s\n", newpfx,
+                       arm_reg_ctx_strs[ctx_info->type]);
+               print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4,
+                               (ctx_info + 1), ctx_info->size, 0);
+               len -= size;
+               /* Entries are variable length: step by header plus payload. */
+               ctx_info = (const struct cper_arm_ctx_info *)
+                               ((unsigned long)ctx_info + size);
+       }
+
+       /* Whatever remains in the section is dumped as vendor-specific data. */
+       if (len > 0) {
+               printk("%sVendor specific error info has %d bytes:\n", pfx,
+                      len);
+               print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, ctx_info,
+                               len, true);
+       }
+}
+#endif
+
 static const char * const mem_err_type_strs[] = {
        "unknown",
        "no error",
@@ -386,13 +509,38 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
        pfx, pcie->bridge.secondary_status, pcie->bridge.control);
 }
 
-static void cper_estatus_print_section(
-       const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
+/*
+ * Print the timestamp of a v3 generic error data entry, if it carries one.
+ *
+ * Nothing is printed unless ACPI_HEST_GEN_VALID_TIMESTAMP is set in the
+ * entry's validation bits.  The time_stamp field is read as eight bytes:
+ * seconds, minutes, hours, a flags byte (bit 0 selects "precise"), day,
+ * month, year and century, each numeric byte being BCD-decoded.
+ */
+static void cper_print_tstamp(const char *pfx,
+                                  struct acpi_hest_generic_data_v300 *gdata)
+{
+       __u8 *ts;
+       __u8 ss, mi, hh, dd, mo, yy, cc;
+       const char *kind;
+
+       if (!(gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP))
+               return;
+
+       ts = (__u8 *)&(gdata->time_stamp);
+       ss = bcd2bin(ts[0]);
+       mi = bcd2bin(ts[1]);
+       hh = bcd2bin(ts[2]);
+       dd = bcd2bin(ts[4]);
+       mo = bcd2bin(ts[5]);
+       yy = bcd2bin(ts[6]);
+       cc = bcd2bin(ts[7]);
+
+       /* Byte 3 is not a time component; only its low bit is examined. */
+       kind = (ts[3] & 0x1) ? "precise " : "imprecise ";
+
+       printk("%s%ststamp: %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx, kind,
+              cc, yy, mo, dd, hh, mi, ss);
+}
+
+static void
+cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
+                          int sec_no)
 {
        uuid_le *sec_type = (uuid_le *)gdata->section_type;
        __u16 severity;
        char newpfx[64];
 
+       if (acpi_hest_get_version(gdata) >= 3)
+               cper_print_tstamp(pfx, (struct acpi_hest_generic_data_v300 *)gdata);
+
        severity = gdata->error_severity;
        printk("%s""Error %d, type: %s\n", pfx, sec_no,
               cper_severity_str(severity));
@@ -403,14 +551,16 @@ static void cper_estatus_print_section(
 
        snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
        if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
-               struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1);
+               struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
+
                printk("%s""section_type: general processor error\n", newpfx);
                if (gdata->error_data_length >= sizeof(*proc_err))
                        cper_print_proc_generic(newpfx, proc_err);
                else
                        goto err_section_too_small;
        } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
-               struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
+               struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
                printk("%s""section_type: memory error\n", newpfx);
                if (gdata->error_data_length >=
                    sizeof(struct cper_sec_mem_err_old))
@@ -419,14 +569,32 @@ static void cper_estatus_print_section(
                else
                        goto err_section_too_small;
        } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
-               struct cper_sec_pcie *pcie = (void *)(gdata + 1);
+               struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata);
+
                printk("%s""section_type: PCIe error\n", newpfx);
                if (gdata->error_data_length >= sizeof(*pcie))
                        cper_print_pcie(newpfx, pcie, gdata);
                else
                        goto err_section_too_small;
-       } else
-               printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
+#if defined(CONFIG_ARM64) || defined(CONFIG_ARM)
+       } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_ARM)) {
+               struct cper_sec_proc_arm *arm_err = acpi_hest_get_payload(gdata);
+
+               printk("%ssection_type: ARM processor error\n", newpfx);
+               if (gdata->error_data_length >= sizeof(*arm_err))
+                       cper_print_proc_arm(newpfx, arm_err);
+               else
+                       goto err_section_too_small;
+#endif
+       } else {
+               const void *err = acpi_hest_get_payload(gdata);
+
+               printk("%ssection type: unknown, %pUl\n", newpfx, sec_type);
+               printk("%ssection length: %#x\n", newpfx,
+                      gdata->error_data_length);
+               print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, err,
+                              gdata->error_data_length, true);
+       }
 
        return;
 
@@ -438,7 +606,7 @@ void cper_estatus_print(const char *pfx,
                        const struct acpi_hest_generic_status *estatus)
 {
        struct acpi_hest_generic_data *gdata;
-       unsigned int data_len, gedata_len;
+       unsigned int data_len;
        int sec_no = 0;
        char newpfx[64];
        __u16 severity;
@@ -452,11 +620,11 @@ void cper_estatus_print(const char *pfx,
        data_len = estatus->data_length;
        gdata = (struct acpi_hest_generic_data *)(estatus + 1);
        snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
-       while (data_len >= sizeof(*gdata)) {
-               gedata_len = gdata->error_data_length;
+
+       while (data_len >= acpi_hest_get_size(gdata)) {
                cper_estatus_print_section(newpfx, gdata, sec_no);
-               data_len -= gedata_len + sizeof(*gdata);
-               gdata = (void *)(gdata + 1) + gedata_len;
+               data_len -= acpi_hest_get_record_size(gdata);
+               gdata = acpi_hest_get_next(gdata);
                sec_no++;
        }
 }
@@ -486,12 +654,14 @@ int cper_estatus_check(const struct acpi_hest_generic_status *estatus)
                return rc;
        data_len = estatus->data_length;
        gdata = (struct acpi_hest_generic_data *)(estatus + 1);
-       while (data_len >= sizeof(*gdata)) {
-               gedata_len = gdata->error_data_length;
-               if (gedata_len > data_len - sizeof(*gdata))
+
+       while (data_len >= acpi_hest_get_size(gdata)) {
+               gedata_len = acpi_hest_get_error_length(gdata);
+               if (gedata_len > data_len - acpi_hest_get_size(gdata))
                        return -EINVAL;
-               data_len -= gedata_len + sizeof(*gdata);
-               gdata = (void *)(gdata + 1) + gedata_len;
+
+               data_len -= acpi_hest_get_record_size(gdata);
+               gdata = acpi_hest_get_next(gdata);
        }
        if (data_len)
                return -EINVAL;
index 676232a..f1fd5f4 100644 (file)
@@ -39,7 +39,6 @@ config ARM_GIC_V3_ITS
        bool
        depends on PCI
        depends on PCI_MSI
-       select ACPI_IORT if ACPI
 
 config ARM_NVIC
        bool
index aa587ed..e5197ff 100644 (file)
@@ -3,9 +3,10 @@
 #
 
 menu "Performance monitor support"
+       depends on PERF_EVENTS
 
 config ARM_PMU
-       depends on PERF_EVENTS && (ARM || ARM64)
+       depends on ARM || ARM64
        bool "ARM PMU framework"
        default y
        help
@@ -18,7 +19,7 @@ config ARM_PMU_ACPI
 
 config QCOM_L2_PMU
        bool "Qualcomm Technologies L2-cache PMU"
-       depends on ARCH_QCOM && ARM64 && PERF_EVENTS && ACPI
+       depends on ARCH_QCOM && ARM64 && ACPI
          help
          Provides support for the L2 cache performance monitor unit (PMU)
          in Qualcomm Technologies processors.
@@ -27,7 +28,7 @@ config QCOM_L2_PMU
 
 config QCOM_L3_PMU
        bool "Qualcomm Technologies L3-cache PMU"
-       depends on ARCH_QCOM && ARM64 && PERF_EVENTS && ACPI
+       depends on ARCH_QCOM && ARM64 && ACPI
        select QCOM_IRQ_COMBINER
        help
           Provides support for the L3 cache performance monitor unit (PMU)
@@ -36,7 +37,7 @@ config QCOM_L3_PMU
           monitoring L3 cache events.
 
 config XGENE_PMU
-        depends on PERF_EVENTS && ARCH_XGENE
+        depends on ARCH_XGENE
         bool "APM X-Gene SoC PMU"
         default n
         help
index 35b5289..e841282 100644 (file)
@@ -37,6 +37,8 @@
 
 #define CSW_CSWCR                       0x0000
 #define  CSW_CSWCR_DUALMCB_MASK         BIT(0)
+#define  CSW_CSWCR_MCB0_ROUTING(x)     (((x) & 0x0C) >> 2)
+#define  CSW_CSWCR_MCB1_ROUTING(x)     (((x) & 0x30) >> 4)
 #define MCBADDRMR                       0x0000
 #define  MCBADDRMR_DUALMCU_MODE_MASK    BIT(2)
 
 #define  PCPPMU_INT_L3C                BIT(2)
 #define  PCPPMU_INT_IOB                BIT(3)
 
+#define  PCPPMU_V3_INTMASK     0x00FF33FF
+#define  PCPPMU_V3_INTENMASK   0xFFFFFFFF
+#define  PCPPMU_V3_INTCLRMASK  0xFF00CC00
+#define  PCPPMU_V3_INT_MCU     0x000000FF
+#define  PCPPMU_V3_INT_MCB     0x00000300
+#define  PCPPMU_V3_INT_L3C     0x00FF0000
+#define  PCPPMU_V3_INT_IOB     0x00003000
+
 #define PMU_MAX_COUNTERS       4
-#define PMU_CNT_MAX_PERIOD     0x100000000ULL
+#define PMU_CNT_MAX_PERIOD     0xFFFFFFFFULL
+#define PMU_V3_CNT_MAX_PERIOD  0xFFFFFFFFFFFFFFFFULL
 #define PMU_OVERFLOW_MASK      0xF
 #define PMU_PMCR_E             BIT(0)
 #define PMU_PMCR_P             BIT(1)
 #define PMU_PMOVSR             0xC80
 #define PMU_PMCR               0xE04
 
+/* PMU registers for V3 */
+#define PMU_PMOVSCLR           0xC80
+#define PMU_PMOVSSET           0xCC0
+
 #define to_pmu_dev(p)     container_of(p, struct xgene_pmu_dev, pmu)
 #define GET_CNTR(ev)      (ev->hw.idx)
 #define GET_EVENTID(ev)   (ev->hw.config & 0xFFULL)
@@ -96,14 +111,33 @@ struct xgene_pmu_dev {
        struct perf_event *pmu_counter_event[PMU_MAX_COUNTERS];
 };
 
+/*
+ * Table of PMU callbacks, reached through the 'ops' pointer in
+ * struct xgene_pmu.  mask_int/unmask_int operate on the top-level
+ * struct xgene_pmu; every other callback takes a struct xgene_pmu_dev,
+ * plus a counter index where one is needed.
+ * NOTE(review): presumably one instance exists per hardware version so the
+ * common code can stay version-agnostic -- confirm where 'ops' is assigned.
+ */
+struct xgene_pmu_ops {
+       void (*mask_int)(struct xgene_pmu *pmu);
+       void (*unmask_int)(struct xgene_pmu *pmu);
+       u64 (*read_counter)(struct xgene_pmu_dev *pmu, int idx);
+       void (*write_counter)(struct xgene_pmu_dev *pmu, int idx, u64 val);
+       void (*write_evttype)(struct xgene_pmu_dev *pmu_dev, int idx, u32 val);
+       void (*write_agentmsk)(struct xgene_pmu_dev *pmu_dev, u32 val);
+       void (*write_agent1msk)(struct xgene_pmu_dev *pmu_dev, u32 val);
+       void (*enable_counter)(struct xgene_pmu_dev *pmu_dev, int idx);
+       void (*disable_counter)(struct xgene_pmu_dev *pmu_dev, int idx);
+       void (*enable_counter_int)(struct xgene_pmu_dev *pmu_dev, int idx);
+       void (*disable_counter_int)(struct xgene_pmu_dev *pmu_dev, int idx);
+       void (*reset_counters)(struct xgene_pmu_dev *pmu_dev);
+       void (*start_counters)(struct xgene_pmu_dev *pmu_dev);
+       void (*stop_counters)(struct xgene_pmu_dev *pmu_dev);
+};
+
 struct xgene_pmu {
        struct device *dev;
        int version;
        void __iomem *pcppmu_csr;
        u32 mcb_active_mask;
        u32 mc_active_mask;
+       u32 l3c_active_mask;
        cpumask_t cpu;
        raw_spinlock_t lock;
+       const struct xgene_pmu_ops *ops;
        struct list_head l3cpmus;
        struct list_head iobpmus;
        struct list_head mcbpmus;
@@ -125,11 +159,13 @@ struct xgene_pmu_data {
 enum xgene_pmu_version {
        PCP_PMU_V1 = 1,
        PCP_PMU_V2,
+       PCP_PMU_V3,
 };
 
 enum xgene_pmu_dev_type {
        PMU_TYPE_L3C = 0,
        PMU_TYPE_IOB,
+       PMU_TYPE_IOB_SLOW,
        PMU_TYPE_MCB,
        PMU_TYPE_MC,
 };
@@ -195,6 +231,56 @@ static const struct attribute_group mc_pmu_format_attr_group = {
        .attrs = mc_pmu_format_attrs,
 };
 
+/*
+ * PMU v3 format attributes: one NULL-terminated list per device type, each
+ * holding a single "<type>_eventid" entry described by a "config:<lo>-<hi>"
+ * string, followed by the attribute groups (all named "format") that wrap
+ * those lists.
+ * NOTE(review): the iob_slow range is "0-16" while the others end on an odd
+ * bit number -- confirm that width is intended.
+ */
+static struct attribute *l3c_pmu_v3_format_attrs[] = {
+       XGENE_PMU_FORMAT_ATTR(l3c_eventid, "config:0-39"),
+       NULL,
+};
+
+static struct attribute *iob_pmu_v3_format_attrs[] = {
+       XGENE_PMU_FORMAT_ATTR(iob_eventid, "config:0-47"),
+       NULL,
+};
+
+static struct attribute *iob_slow_pmu_v3_format_attrs[] = {
+       XGENE_PMU_FORMAT_ATTR(iob_slow_eventid, "config:0-16"),
+       NULL,
+};
+
+static struct attribute *mcb_pmu_v3_format_attrs[] = {
+       XGENE_PMU_FORMAT_ATTR(mcb_eventid, "config:0-35"),
+       NULL,
+};
+
+static struct attribute *mc_pmu_v3_format_attrs[] = {
+       XGENE_PMU_FORMAT_ATTR(mc_eventid, "config:0-44"),
+       NULL,
+};
+
+static const struct attribute_group l3c_pmu_v3_format_attr_group = {
+       .name = "format",
+       .attrs = l3c_pmu_v3_format_attrs,
+};
+
+static const struct attribute_group iob_pmu_v3_format_attr_group = {
+       .name = "format",
+       .attrs = iob_pmu_v3_format_attrs,
+};
+
+static const struct attribute_group iob_slow_pmu_v3_format_attr_group = {
+       .name = "format",
+       .attrs = iob_slow_pmu_v3_format_attrs,
+};
+
+static const struct attribute_group mcb_pmu_v3_format_attr_group = {
+       .name = "format",
+       .attrs = mcb_pmu_v3_format_attrs,
+};
+
+static const struct attribute_group mc_pmu_v3_format_attr_group = {
+       .name = "format",
+       .attrs = mc_pmu_v3_format_attrs,
+};
+
 /*
  * sysfs event attributes
  */
@@ -311,6 +397,219 @@ static const struct attribute_group mc_pmu_events_attr_group = {
        .attrs = mc_pmu_events_attrs,
 };
 
+/*
+ * Event list for the v3 L3C PMU (NULL-terminated).  Each entry pairs an
+ * event name with a numeric ID; the IDs run 0x00-0x27 with 0x0f absent.
+ */
+static struct attribute *l3c_pmu_v3_events_attrs[] = {
+       XGENE_PMU_EVENT_ATTR(cycle-count,                       0x00),
+       XGENE_PMU_EVENT_ATTR(read-hit,                          0x01),
+       XGENE_PMU_EVENT_ATTR(read-miss,                         0x02),
+       XGENE_PMU_EVENT_ATTR(index-flush-eviction,              0x03),
+       XGENE_PMU_EVENT_ATTR(write-caused-replacement,          0x04),
+       XGENE_PMU_EVENT_ATTR(write-not-caused-replacement,      0x05),
+       XGENE_PMU_EVENT_ATTR(clean-eviction,                    0x06),
+       XGENE_PMU_EVENT_ATTR(dirty-eviction,                    0x07),
+       XGENE_PMU_EVENT_ATTR(read,                              0x08),
+       XGENE_PMU_EVENT_ATTR(write,                             0x09),
+       XGENE_PMU_EVENT_ATTR(request,                           0x0a),
+       XGENE_PMU_EVENT_ATTR(tq-bank-conflict-issue-stall,      0x0b),
+       XGENE_PMU_EVENT_ATTR(tq-full,                           0x0c),
+       XGENE_PMU_EVENT_ATTR(ackq-full,                         0x0d),
+       XGENE_PMU_EVENT_ATTR(wdb-full,                          0x0e),
+       XGENE_PMU_EVENT_ATTR(odb-full,                          0x10),
+       XGENE_PMU_EVENT_ATTR(wbq-full,                          0x11),
+       XGENE_PMU_EVENT_ATTR(input-req-async-fifo-stall,        0x12),
+       XGENE_PMU_EVENT_ATTR(output-req-async-fifo-stall,       0x13),
+       XGENE_PMU_EVENT_ATTR(output-data-async-fifo-stall,      0x14),
+       XGENE_PMU_EVENT_ATTR(total-insertion,                   0x15),
+       XGENE_PMU_EVENT_ATTR(sip-insertions-r-set,              0x16),
+       XGENE_PMU_EVENT_ATTR(sip-insertions-r-clear,            0x17),
+       XGENE_PMU_EVENT_ATTR(dip-insertions-r-set,              0x18),
+       XGENE_PMU_EVENT_ATTR(dip-insertions-r-clear,            0x19),
+       XGENE_PMU_EVENT_ATTR(dip-insertions-force-r-set,        0x1a),
+       XGENE_PMU_EVENT_ATTR(egression,                         0x1b),
+       XGENE_PMU_EVENT_ATTR(replacement,                       0x1c),
+       XGENE_PMU_EVENT_ATTR(old-replacement,                   0x1d),
+       XGENE_PMU_EVENT_ATTR(young-replacement,                 0x1e),
+       XGENE_PMU_EVENT_ATTR(r-set-replacement,                 0x1f),
+       XGENE_PMU_EVENT_ATTR(r-clear-replacement,               0x20),
+       XGENE_PMU_EVENT_ATTR(old-r-replacement,                 0x21),
+       XGENE_PMU_EVENT_ATTR(old-nr-replacement,                0x22),
+       XGENE_PMU_EVENT_ATTR(young-r-replacement,               0x23),
+       XGENE_PMU_EVENT_ATTR(young-nr-replacement,              0x24),
+       XGENE_PMU_EVENT_ATTR(bloomfilter-clearing,              0x25),
+       XGENE_PMU_EVENT_ATTR(generation-flip,                   0x26),
+       XGENE_PMU_EVENT_ATTR(vcc-droop-detected,                0x27),
+       NULL,
+};
+
+/*
+ * Event list for the v3 "fast" IOB PMU (NULL-terminated).  Each entry pairs
+ * an event name with a numeric ID.  The IDs are deliberately sparse: only
+ * 0x00-0x0b, 0x10-0x18, 0x1b-0x1d, 0x20-0x26 and 0x28-0x2f are listed.
+ */
+static struct attribute *iob_fast_pmu_v3_events_attrs[] = {
+       XGENE_PMU_EVENT_ATTR(cycle-count,                       0x00),
+       XGENE_PMU_EVENT_ATTR(pa-req-buf-alloc-all,              0x01),
+       XGENE_PMU_EVENT_ATTR(pa-req-buf-alloc-rd,               0x02),
+       XGENE_PMU_EVENT_ATTR(pa-req-buf-alloc-wr,               0x03),
+       XGENE_PMU_EVENT_ATTR(pa-all-cp-req,                     0x04),
+       XGENE_PMU_EVENT_ATTR(pa-cp-blk-req,                     0x05),
+       XGENE_PMU_EVENT_ATTR(pa-cp-ptl-req,                     0x06),
+       XGENE_PMU_EVENT_ATTR(pa-cp-rd-req,                      0x07),
+       XGENE_PMU_EVENT_ATTR(pa-cp-wr-req,                      0x08),
+       XGENE_PMU_EVENT_ATTR(ba-all-req,                        0x09),
+       XGENE_PMU_EVENT_ATTR(ba-rd-req,                         0x0a),
+       XGENE_PMU_EVENT_ATTR(ba-wr-req,                         0x0b),
+       XGENE_PMU_EVENT_ATTR(pa-rd-shared-req-issued,           0x10),
+       XGENE_PMU_EVENT_ATTR(pa-rd-exclusive-req-issued,        0x11),
+       XGENE_PMU_EVENT_ATTR(pa-wr-invalidate-req-issued-stashable, 0x12),
+       XGENE_PMU_EVENT_ATTR(pa-wr-invalidate-req-issued-nonstashable, 0x13),
+       XGENE_PMU_EVENT_ATTR(pa-wr-back-req-issued-stashable,   0x14),
+       XGENE_PMU_EVENT_ATTR(pa-wr-back-req-issued-nonstashable, 0x15),
+       XGENE_PMU_EVENT_ATTR(pa-ptl-wr-req,                     0x16),
+       XGENE_PMU_EVENT_ATTR(pa-ptl-rd-req,                     0x17),
+       XGENE_PMU_EVENT_ATTR(pa-wr-back-clean-data,             0x18),
+       XGENE_PMU_EVENT_ATTR(pa-wr-back-cancelled-on-SS,        0x1b),
+       XGENE_PMU_EVENT_ATTR(pa-barrier-occurrence,             0x1c),
+       XGENE_PMU_EVENT_ATTR(pa-barrier-cycles,                 0x1d),
+       XGENE_PMU_EVENT_ATTR(pa-total-cp-snoops,                0x20),
+       XGENE_PMU_EVENT_ATTR(pa-rd-shared-snoop,                0x21),
+       XGENE_PMU_EVENT_ATTR(pa-rd-shared-snoop-hit,            0x22),
+       XGENE_PMU_EVENT_ATTR(pa-rd-exclusive-snoop,             0x23),
+       XGENE_PMU_EVENT_ATTR(pa-rd-exclusive-snoop-hit,         0x24),
+       XGENE_PMU_EVENT_ATTR(pa-rd-wr-invalid-snoop,            0x25),
+       XGENE_PMU_EVENT_ATTR(pa-rd-wr-invalid-snoop-hit,        0x26),
+       XGENE_PMU_EVENT_ATTR(pa-req-buffer-full,                0x28),
+       XGENE_PMU_EVENT_ATTR(cswlf-outbound-req-fifo-full,      0x29),
+       XGENE_PMU_EVENT_ATTR(cswlf-inbound-snoop-fifo-backpressure, 0x2a),
+       XGENE_PMU_EVENT_ATTR(cswlf-outbound-lack-fifo-full,     0x2b),
+       XGENE_PMU_EVENT_ATTR(cswlf-inbound-gack-fifo-backpressure, 0x2c),
+       XGENE_PMU_EVENT_ATTR(cswlf-outbound-data-fifo-full,     0x2d),
+       XGENE_PMU_EVENT_ATTR(cswlf-inbound-data-fifo-backpressure, 0x2e),
+       XGENE_PMU_EVENT_ATTR(cswlf-inbound-req-backpressure,    0x2f),
+       NULL,
+};
+
+/*
+ * Event list for the v3 "slow" IOB PMU (NULL-terminated).  Each entry pairs
+ * an event name with a numeric ID; only 0x00-0x04, 0x07-0x09 and 0x10 are
+ * listed.
+ */
+static struct attribute *iob_slow_pmu_v3_events_attrs[] = {
+       XGENE_PMU_EVENT_ATTR(cycle-count,                       0x00),
+       XGENE_PMU_EVENT_ATTR(pa-axi0-rd-req,                    0x01),
+       XGENE_PMU_EVENT_ATTR(pa-axi0-wr-req,                    0x02),
+       XGENE_PMU_EVENT_ATTR(pa-axi1-rd-req,                    0x03),
+       XGENE_PMU_EVENT_ATTR(pa-axi1-wr-req,                    0x04),
+       XGENE_PMU_EVENT_ATTR(ba-all-axi-req,                    0x07),
+       XGENE_PMU_EVENT_ATTR(ba-axi-rd-req,                     0x08),
+       XGENE_PMU_EVENT_ATTR(ba-axi-wr-req,                     0x09),
+       XGENE_PMU_EVENT_ATTR(ba-free-list-empty,                0x10),
+       NULL,
+};
+
+/*
+ * Event list for the v3 MCB PMU (NULL-terminated).  Each entry pairs an
+ * event name with a numeric ID; the IDs run contiguously from 0x00 to 0x23.
+ */
+static struct attribute *mcb_pmu_v3_events_attrs[] = {
+       XGENE_PMU_EVENT_ATTR(cycle-count,                       0x00),
+       XGENE_PMU_EVENT_ATTR(req-receive,                       0x01),
+       XGENE_PMU_EVENT_ATTR(rd-req-recv,                       0x02),
+       XGENE_PMU_EVENT_ATTR(rd-req-recv-2,                     0x03),
+       XGENE_PMU_EVENT_ATTR(wr-req-recv,                       0x04),
+       XGENE_PMU_EVENT_ATTR(wr-req-recv-2,                     0x05),
+       XGENE_PMU_EVENT_ATTR(rd-req-sent-to-mcu,                0x06),
+       XGENE_PMU_EVENT_ATTR(rd-req-sent-to-mcu-2,              0x07),
+       XGENE_PMU_EVENT_ATTR(rd-req-sent-to-spec-mcu,           0x08),
+       XGENE_PMU_EVENT_ATTR(rd-req-sent-to-spec-mcu-2,         0x09),
+       XGENE_PMU_EVENT_ATTR(glbl-ack-recv-for-rd-sent-to-spec-mcu, 0x0a),
+       XGENE_PMU_EVENT_ATTR(glbl-ack-go-recv-for-rd-sent-to-spec-mcu, 0x0b),
+       XGENE_PMU_EVENT_ATTR(glbl-ack-nogo-recv-for-rd-sent-to-spec-mcu, 0x0c),
+       XGENE_PMU_EVENT_ATTR(glbl-ack-go-recv-any-rd-req,       0x0d),
+       XGENE_PMU_EVENT_ATTR(glbl-ack-go-recv-any-rd-req-2,     0x0e),
+       XGENE_PMU_EVENT_ATTR(wr-req-sent-to-mcu,                0x0f),
+       XGENE_PMU_EVENT_ATTR(gack-recv,                         0x10),
+       XGENE_PMU_EVENT_ATTR(rd-gack-recv,                      0x11),
+       XGENE_PMU_EVENT_ATTR(wr-gack-recv,                      0x12),
+       XGENE_PMU_EVENT_ATTR(cancel-rd-gack,                    0x13),
+       XGENE_PMU_EVENT_ATTR(cancel-wr-gack,                    0x14),
+       XGENE_PMU_EVENT_ATTR(mcb-csw-req-stall,                 0x15),
+       XGENE_PMU_EVENT_ATTR(mcu-req-intf-blocked,              0x16),
+       XGENE_PMU_EVENT_ATTR(mcb-mcu-rd-intf-stall,             0x17),
+       XGENE_PMU_EVENT_ATTR(csw-rd-intf-blocked,               0x18),
+       XGENE_PMU_EVENT_ATTR(csw-local-ack-intf-blocked,        0x19),
+       XGENE_PMU_EVENT_ATTR(mcu-req-table-full,                0x1a),
+       XGENE_PMU_EVENT_ATTR(mcu-stat-table-full,               0x1b),
+       XGENE_PMU_EVENT_ATTR(mcu-wr-table-full,                 0x1c),
+       XGENE_PMU_EVENT_ATTR(mcu-rdreceipt-resp,                0x1d),
+       XGENE_PMU_EVENT_ATTR(mcu-wrcomplete-resp,               0x1e),
+       XGENE_PMU_EVENT_ATTR(mcu-retryack-resp,                 0x1f),
+       XGENE_PMU_EVENT_ATTR(mcu-pcrdgrant-resp,                0x20),
+       XGENE_PMU_EVENT_ATTR(mcu-req-from-lastload,             0x21),
+       XGENE_PMU_EVENT_ATTR(mcu-req-from-bypass,               0x22),
+       XGENE_PMU_EVENT_ATTR(volt-droop-detect,                 0x23),
+       NULL,
+};
+
+/*
+ * Event list for the v3 MC PMU (NULL-terminated).  Each entry pairs an
+ * event name with a numeric ID; the IDs run contiguously from 0x00 to 0x2c.
+ */
+static struct attribute *mc_pmu_v3_events_attrs[] = {
+       XGENE_PMU_EVENT_ATTR(cycle-count,                       0x00),
+       XGENE_PMU_EVENT_ATTR(act-sent,                          0x01),
+       XGENE_PMU_EVENT_ATTR(pre-sent,                          0x02),
+       XGENE_PMU_EVENT_ATTR(rd-sent,                           0x03),
+       XGENE_PMU_EVENT_ATTR(rda-sent,                          0x04),
+       XGENE_PMU_EVENT_ATTR(wr-sent,                           0x05),
+       XGENE_PMU_EVENT_ATTR(wra-sent,                          0x06),
+       XGENE_PMU_EVENT_ATTR(pd-entry-vld,                      0x07),
+       XGENE_PMU_EVENT_ATTR(sref-entry-vld,                    0x08),
+       XGENE_PMU_EVENT_ATTR(prea-sent,                         0x09),
+       XGENE_PMU_EVENT_ATTR(ref-sent,                          0x0a),
+       XGENE_PMU_EVENT_ATTR(rd-rda-sent,                       0x0b),
+       XGENE_PMU_EVENT_ATTR(wr-wra-sent,                       0x0c),
+       XGENE_PMU_EVENT_ATTR(raw-hazard,                        0x0d),
+       XGENE_PMU_EVENT_ATTR(war-hazard,                        0x0e),
+       XGENE_PMU_EVENT_ATTR(waw-hazard,                        0x0f),
+       XGENE_PMU_EVENT_ATTR(rar-hazard,                        0x10),
+       XGENE_PMU_EVENT_ATTR(raw-war-waw-hazard,                0x11),
+       XGENE_PMU_EVENT_ATTR(hprd-lprd-wr-req-vld,              0x12),
+       XGENE_PMU_EVENT_ATTR(lprd-req-vld,                      0x13),
+       XGENE_PMU_EVENT_ATTR(hprd-req-vld,                      0x14),
+       XGENE_PMU_EVENT_ATTR(hprd-lprd-req-vld,                 0x15),
+       XGENE_PMU_EVENT_ATTR(wr-req-vld,                        0x16),
+       XGENE_PMU_EVENT_ATTR(partial-wr-req-vld,                0x17),
+       XGENE_PMU_EVENT_ATTR(rd-retry,                          0x18),
+       XGENE_PMU_EVENT_ATTR(wr-retry,                          0x19),
+       XGENE_PMU_EVENT_ATTR(retry-gnt,                         0x1a),
+       XGENE_PMU_EVENT_ATTR(rank-change,                       0x1b),
+       XGENE_PMU_EVENT_ATTR(dir-change,                        0x1c),
+       XGENE_PMU_EVENT_ATTR(rank-dir-change,                   0x1d),
+       XGENE_PMU_EVENT_ATTR(rank-active,                       0x1e),
+       XGENE_PMU_EVENT_ATTR(rank-idle,                         0x1f),
+       XGENE_PMU_EVENT_ATTR(rank-pd,                           0x20),
+       XGENE_PMU_EVENT_ATTR(rank-sref,                         0x21),
+       XGENE_PMU_EVENT_ATTR(queue-fill-gt-thresh,              0x22),
+       XGENE_PMU_EVENT_ATTR(queue-rds-gt-thresh,               0x23),
+       XGENE_PMU_EVENT_ATTR(queue-wrs-gt-thresh,               0x24),
+       XGENE_PMU_EVENT_ATTR(phy-updt-complt,                   0x25),
+       XGENE_PMU_EVENT_ATTR(tz-fail,                           0x26),
+       XGENE_PMU_EVENT_ATTR(dram-errc,                         0x27),
+       XGENE_PMU_EVENT_ATTR(dram-errd,                         0x28),
+       XGENE_PMU_EVENT_ATTR(rd-enq,                            0x29),
+       XGENE_PMU_EVENT_ATTR(wr-enq,                            0x2a),
+       XGENE_PMU_EVENT_ATTR(tmac-limit-reached,                0x2b),
+       XGENE_PMU_EVENT_ATTR(tmaw-tracker-full,                 0x2c),
+       NULL,
+};
+
+/*
+ * Attribute groups, all named "events", wrapping the v3 event lists: one
+ * group each for L3C, fast IOB, slow IOB, MCB and MC.
+ */
+static const struct attribute_group l3c_pmu_v3_events_attr_group = {
+       .name = "events",
+       .attrs = l3c_pmu_v3_events_attrs,
+};
+
+static const struct attribute_group iob_fast_pmu_v3_events_attr_group = {
+       .name = "events",
+       .attrs = iob_fast_pmu_v3_events_attrs,
+};
+
+static const struct attribute_group iob_slow_pmu_v3_events_attr_group = {
+       .name = "events",
+       .attrs = iob_slow_pmu_v3_events_attrs,
+};
+
+static const struct attribute_group mcb_pmu_v3_events_attr_group = {
+       .name = "events",
+       .attrs = mcb_pmu_v3_events_attrs,
+};
+
+static const struct attribute_group mc_pmu_v3_events_attr_group = {
+       .name = "events",
+       .attrs = mc_pmu_v3_events_attrs,
+};
+
 /*
  * sysfs cpumask attributes
  */
@@ -334,7 +633,7 @@ static const struct attribute_group pmu_cpumask_attr_group = {
 };
 
 /*
- * Per PMU device attribute groups
+ * Per PMU device attribute groups of PMU v1 and v2
  */
 static const struct attribute_group *l3c_pmu_attr_groups[] = {
        &l3c_pmu_format_attr_group,
@@ -364,6 +663,44 @@ static const struct attribute_group *mc_pmu_attr_groups[] = {
        NULL
 };
 
+/*
+ * Per PMU device attribute groups of PMU v3
+ */
+static const struct attribute_group *l3c_pmu_v3_attr_groups[] = {
+       &l3c_pmu_v3_format_attr_group,
+       &pmu_cpumask_attr_group,
+       &l3c_pmu_v3_events_attr_group,
+       NULL
+};
+
+static const struct attribute_group *iob_fast_pmu_v3_attr_groups[] = {
+       &iob_pmu_v3_format_attr_group,
+       &pmu_cpumask_attr_group,
+       &iob_fast_pmu_v3_events_attr_group,
+       NULL
+};
+
+static const struct attribute_group *iob_slow_pmu_v3_attr_groups[] = {
+       &iob_slow_pmu_v3_format_attr_group,
+       &pmu_cpumask_attr_group,
+       &iob_slow_pmu_v3_events_attr_group,
+       NULL
+};
+
+static const struct attribute_group *mcb_pmu_v3_attr_groups[] = {
+       &mcb_pmu_v3_format_attr_group,
+       &pmu_cpumask_attr_group,
+       &mcb_pmu_v3_events_attr_group,
+       NULL
+};
+
+static const struct attribute_group *mc_pmu_v3_attr_groups[] = {
+       &mc_pmu_v3_format_attr_group,
+       &pmu_cpumask_attr_group,
+       &mc_pmu_v3_events_attr_group,
+       NULL
+};
+
 static int get_next_avail_cntr(struct xgene_pmu_dev *pmu_dev)
 {
        int cntr;
@@ -387,22 +724,66 @@ static inline void xgene_pmu_mask_int(struct xgene_pmu *xgene_pmu)
        writel(PCPPMU_INTENMASK, xgene_pmu->pcppmu_csr + PCPPMU_INTMASK_REG);
 }
 
+static inline void xgene_pmu_v3_mask_int(struct xgene_pmu *xgene_pmu)
+{
+       writel(PCPPMU_V3_INTENMASK, xgene_pmu->pcppmu_csr + PCPPMU_INTMASK_REG);
+}
+
 static inline void xgene_pmu_unmask_int(struct xgene_pmu *xgene_pmu)
 {
        writel(PCPPMU_INTCLRMASK, xgene_pmu->pcppmu_csr + PCPPMU_INTMASK_REG);
 }
 
-static inline u32 xgene_pmu_read_counter(struct xgene_pmu_dev *pmu_dev, int idx)
+static inline void xgene_pmu_v3_unmask_int(struct xgene_pmu *xgene_pmu)
+{
+       writel(PCPPMU_V3_INTCLRMASK,
+              xgene_pmu->pcppmu_csr + PCPPMU_INTMASK_REG);
+}
+
+static inline u64 xgene_pmu_read_counter32(struct xgene_pmu_dev *pmu_dev,
+                                          int idx)
 {
        return readl(pmu_dev->inf->csr + PMU_PMEVCNTR0 + (4 * idx));
 }
 
+static inline u64 xgene_pmu_read_counter64(struct xgene_pmu_dev *pmu_dev,
+                                          int idx)
+{
+       u32 lo, hi;
+
+       /*
+        * v3 has 64-bit counter registers composed of two 32-bit registers.
+        * This can be a problem if the counter increases and carries
+        * out of bit [31] between the two reads. Re-reading the high word
+        * and retrying on a mismatch prevents this issue.
+        */
+       do {
+               hi = xgene_pmu_read_counter32(pmu_dev, 2 * idx + 1);
+               lo = xgene_pmu_read_counter32(pmu_dev, 2 * idx);
+       } while (hi != xgene_pmu_read_counter32(pmu_dev, 2 * idx + 1));
+
+       return (((u64)hi << 32) | lo);
+}
+
 static inline void
-xgene_pmu_write_counter(struct xgene_pmu_dev *pmu_dev, int idx, u32 val)
+xgene_pmu_write_counter32(struct xgene_pmu_dev *pmu_dev, int idx, u64 val)
 {
        writel(val, pmu_dev->inf->csr + PMU_PMEVCNTR0 + (4 * idx));
 }
 
+static inline void
+xgene_pmu_write_counter64(struct xgene_pmu_dev *pmu_dev, int idx, u64 val)
+{
+       u32 cnt_lo, cnt_hi;
+
+       cnt_hi = upper_32_bits(val);
+       cnt_lo = lower_32_bits(val);
+
+       /* v3 has 64-bit counter registers composed of two 32-bit registers */
+       xgene_pmu_write_counter32(pmu_dev, 2 * idx, cnt_lo);
+       xgene_pmu_write_counter32(pmu_dev, 2 * idx + 1, cnt_hi);
+}
+
 static inline void
 xgene_pmu_write_evttype(struct xgene_pmu_dev *pmu_dev, int idx, u32 val)
 {
@@ -415,12 +796,18 @@ xgene_pmu_write_agentmsk(struct xgene_pmu_dev *pmu_dev, u32 val)
        writel(val, pmu_dev->inf->csr + PMU_PMAMR0);
 }
 
+static inline void
+xgene_pmu_v3_write_agentmsk(struct xgene_pmu_dev *pmu_dev, u32 val) { }
+
 static inline void
 xgene_pmu_write_agent1msk(struct xgene_pmu_dev *pmu_dev, u32 val)
 {
        writel(val, pmu_dev->inf->csr + PMU_PMAMR1);
 }
 
+static inline void
+xgene_pmu_v3_write_agent1msk(struct xgene_pmu_dev *pmu_dev, u32 val) { }
+
 static inline void
 xgene_pmu_enable_counter(struct xgene_pmu_dev *pmu_dev, int idx)
 {
@@ -491,20 +878,22 @@ static inline void xgene_pmu_stop_counters(struct xgene_pmu_dev *pmu_dev)
 static void xgene_perf_pmu_enable(struct pmu *pmu)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
        int enabled = bitmap_weight(pmu_dev->cntr_assign_mask,
                        pmu_dev->max_counters);
 
        if (!enabled)
                return;
 
-       xgene_pmu_start_counters(pmu_dev);
+       xgene_pmu->ops->start_counters(pmu_dev);
 }
 
 static void xgene_perf_pmu_disable(struct pmu *pmu)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
 
-       xgene_pmu_stop_counters(pmu_dev);
+       xgene_pmu->ops->stop_counters(pmu_dev);
 }
 
 static int xgene_perf_event_init(struct perf_event *event)
@@ -572,49 +961,56 @@ static int xgene_perf_event_init(struct perf_event *event)
 static void xgene_perf_enable_event(struct perf_event *event)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
 
-       xgene_pmu_write_evttype(pmu_dev, GET_CNTR(event), GET_EVENTID(event));
-       xgene_pmu_write_agentmsk(pmu_dev, ~((u32)GET_AGENTID(event)));
+       xgene_pmu->ops->write_evttype(pmu_dev, GET_CNTR(event),
+                                     GET_EVENTID(event));
+       xgene_pmu->ops->write_agentmsk(pmu_dev, ~((u32)GET_AGENTID(event)));
        if (pmu_dev->inf->type == PMU_TYPE_IOB)
-               xgene_pmu_write_agent1msk(pmu_dev, ~((u32)GET_AGENT1ID(event)));
+               xgene_pmu->ops->write_agent1msk(pmu_dev,
+                                               ~((u32)GET_AGENT1ID(event)));
 
-       xgene_pmu_enable_counter(pmu_dev, GET_CNTR(event));
-       xgene_pmu_enable_counter_int(pmu_dev, GET_CNTR(event));
+       xgene_pmu->ops->enable_counter(pmu_dev, GET_CNTR(event));
+       xgene_pmu->ops->enable_counter_int(pmu_dev, GET_CNTR(event));
 }
 
 static void xgene_perf_disable_event(struct perf_event *event)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
 
-       xgene_pmu_disable_counter(pmu_dev, GET_CNTR(event));
-       xgene_pmu_disable_counter_int(pmu_dev, GET_CNTR(event));
+       xgene_pmu->ops->disable_counter(pmu_dev, GET_CNTR(event));
+       xgene_pmu->ops->disable_counter_int(pmu_dev, GET_CNTR(event));
 }
 
 static void xgene_perf_event_set_period(struct perf_event *event)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
        struct hw_perf_event *hw = &event->hw;
        /*
-        * The X-Gene PMU counters have a period of 2^32. To account for the
-        * possiblity of extreme interrupt latency we program for a period of
-        * half that. Hopefully we can handle the interrupt before another 2^31
+        * A 32-bit counter has a period of 2^32. To account for the
+        * possibility of extreme interrupt latency we program for a period of
+        * half that. Hopefully, we can handle the interrupt before another 2^31
         * events occur and the counter overtakes its previous value.
+        * A 64-bit counter is not expected to overflow.
         */
        u64 val = 1ULL << 31;
 
        local64_set(&hw->prev_count, val);
-       xgene_pmu_write_counter(pmu_dev, hw->idx, (u32) val);
+       xgene_pmu->ops->write_counter(pmu_dev, hw->idx, val);
 }
 
 static void xgene_perf_event_update(struct perf_event *event)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
        struct hw_perf_event *hw = &event->hw;
        u64 delta, prev_raw_count, new_raw_count;
 
 again:
        prev_raw_count = local64_read(&hw->prev_count);
-       new_raw_count = xgene_pmu_read_counter(pmu_dev, GET_CNTR(event));
+       new_raw_count = xgene_pmu->ops->read_counter(pmu_dev, GET_CNTR(event));
 
        if (local64_cmpxchg(&hw->prev_count, prev_raw_count,
                            new_raw_count) != prev_raw_count)
@@ -633,6 +1029,7 @@ static void xgene_perf_read(struct perf_event *event)
 static void xgene_perf_start(struct perf_event *event, int flags)
 {
        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
+       struct xgene_pmu *xgene_pmu = pmu_dev->parent;
        struct hw_perf_event *hw = &event->hw;
 
        if (WARN_ON_ONCE(!(hw->state & PERF_HES_STOPPED)))
@@ -646,8 +1043,8 @@ static void xgene_perf_start(struct perf_event *event, int flags)
        if (flags & PERF_EF_RELOAD) {
                u64 prev_raw_count =  local64_read(&hw->prev_count);
 
-               xgene_pmu_write_counter(pmu_dev, GET_CNTR(event),
-                                       (u32) prev_raw_count);
+               xgene_pmu->ops->write_counter(pmu_dev, GET_CNTR(event),
+                                             prev_raw_count);
        }
 
        xgene_perf_enable_event(event);
@@ -713,7 +1110,10 @@ static int xgene_init_perf(struct xgene_pmu_dev *pmu_dev, char *name)
 {
        struct xgene_pmu *xgene_pmu;
 
-       pmu_dev->max_period = PMU_CNT_MAX_PERIOD - 1;
+       if (pmu_dev->parent->version == PCP_PMU_V3)
+               pmu_dev->max_period = PMU_V3_CNT_MAX_PERIOD;
+       else
+               pmu_dev->max_period = PMU_CNT_MAX_PERIOD;
        /* First version PMU supports only single event counter */
        xgene_pmu = pmu_dev->parent;
        if (xgene_pmu->version == PCP_PMU_V1)
@@ -736,8 +1136,8 @@ static int xgene_init_perf(struct xgene_pmu_dev *pmu_dev, char *name)
        };
 
        /* Hardware counter init */
-       xgene_pmu_stop_counters(pmu_dev);
-       xgene_pmu_reset_counters(pmu_dev);
+       xgene_pmu->ops->stop_counters(pmu_dev);
+       xgene_pmu->ops->reset_counters(pmu_dev);
 
        return perf_pmu_register(&pmu_dev->pmu, name, -1);
 }
@@ -758,20 +1158,38 @@ xgene_pmu_dev_add(struct xgene_pmu *xgene_pmu, struct xgene_pmu_dev_ctx *ctx)
 
        switch (pmu->inf->type) {
        case PMU_TYPE_L3C:
-               pmu->attr_groups = l3c_pmu_attr_groups;
+               if (!(xgene_pmu->l3c_active_mask & pmu->inf->enable_mask))
+                       goto dev_err;
+               if (xgene_pmu->version == PCP_PMU_V3)
+                       pmu->attr_groups = l3c_pmu_v3_attr_groups;
+               else
+                       pmu->attr_groups = l3c_pmu_attr_groups;
                break;
        case PMU_TYPE_IOB:
-               pmu->attr_groups = iob_pmu_attr_groups;
+               if (xgene_pmu->version == PCP_PMU_V3)
+                       pmu->attr_groups = iob_fast_pmu_v3_attr_groups;
+               else
+                       pmu->attr_groups = iob_pmu_attr_groups;
+               break;
+       case PMU_TYPE_IOB_SLOW:
+               if (xgene_pmu->version == PCP_PMU_V3)
+                       pmu->attr_groups = iob_slow_pmu_v3_attr_groups;
                break;
        case PMU_TYPE_MCB:
                if (!(xgene_pmu->mcb_active_mask & pmu->inf->enable_mask))
                        goto dev_err;
-               pmu->attr_groups = mcb_pmu_attr_groups;
+               if (xgene_pmu->version == PCP_PMU_V3)
+                       pmu->attr_groups = mcb_pmu_v3_attr_groups;
+               else
+                       pmu->attr_groups = mcb_pmu_attr_groups;
                break;
        case PMU_TYPE_MC:
                if (!(xgene_pmu->mc_active_mask & pmu->inf->enable_mask))
                        goto dev_err;
-               pmu->attr_groups = mc_pmu_attr_groups;
+               if (xgene_pmu->version == PCP_PMU_V3)
+                       pmu->attr_groups = mc_pmu_v3_attr_groups;
+               else
+                       pmu->attr_groups = mc_pmu_attr_groups;
                break;
        default:
                return -EINVAL;
@@ -795,18 +1213,27 @@ dev_err:
 static void _xgene_pmu_isr(int irq, struct xgene_pmu_dev *pmu_dev)
 {
        struct xgene_pmu *xgene_pmu = pmu_dev->parent;
+       void __iomem *csr = pmu_dev->inf->csr;
        u32 pmovsr;
        int idx;
 
-       pmovsr = readl(pmu_dev->inf->csr + PMU_PMOVSR) & PMU_OVERFLOW_MASK;
+       xgene_pmu->ops->stop_counters(pmu_dev);
+
+       if (xgene_pmu->version == PCP_PMU_V3)
+               pmovsr = readl(csr + PMU_PMOVSSET) & PMU_OVERFLOW_MASK;
+       else
+               pmovsr = readl(csr + PMU_PMOVSR) & PMU_OVERFLOW_MASK;
+
        if (!pmovsr)
-               return;
+               goto out;
 
        /* Clear interrupt flag */
        if (xgene_pmu->version == PCP_PMU_V1)
-               writel(0x0, pmu_dev->inf->csr + PMU_PMOVSR);
+               writel(0x0, csr + PMU_PMOVSR);
+       else if (xgene_pmu->version == PCP_PMU_V2)
+               writel(pmovsr, csr + PMU_PMOVSR);
        else
-               writel(pmovsr, pmu_dev->inf->csr + PMU_PMOVSR);
+               writel(pmovsr, csr + PMU_PMOVSCLR);
 
        for (idx = 0; idx < PMU_MAX_COUNTERS; idx++) {
                struct perf_event *event = pmu_dev->pmu_counter_event[idx];
@@ -818,10 +1245,14 @@ static void _xgene_pmu_isr(int irq, struct xgene_pmu_dev *pmu_dev)
                xgene_perf_event_update(event);
                xgene_perf_event_set_period(event);
        }
+
+out:
+       xgene_pmu->ops->start_counters(pmu_dev);
 }
 
 static irqreturn_t xgene_pmu_isr(int irq, void *dev_id)
 {
+       u32 intr_mcu, intr_mcb, intr_l3c, intr_iob;
        struct xgene_pmu_dev_ctx *ctx;
        struct xgene_pmu *xgene_pmu = dev_id;
        unsigned long flags;
@@ -831,22 +1262,33 @@ static irqreturn_t xgene_pmu_isr(int irq, void *dev_id)
 
        /* Get Interrupt PMU source */
        val = readl(xgene_pmu->pcppmu_csr + PCPPMU_INTSTATUS_REG);
-       if (val & PCPPMU_INT_MCU) {
+       if (xgene_pmu->version == PCP_PMU_V3) {
+               intr_mcu = PCPPMU_V3_INT_MCU;
+               intr_mcb = PCPPMU_V3_INT_MCB;
+               intr_l3c = PCPPMU_V3_INT_L3C;
+               intr_iob = PCPPMU_V3_INT_IOB;
+       } else {
+               intr_mcu = PCPPMU_INT_MCU;
+               intr_mcb = PCPPMU_INT_MCB;
+               intr_l3c = PCPPMU_INT_L3C;
+               intr_iob = PCPPMU_INT_IOB;
+       }
+       if (val & intr_mcu) {
                list_for_each_entry(ctx, &xgene_pmu->mcpmus, next) {
                        _xgene_pmu_isr(irq, ctx->pmu_dev);
                }
        }
-       if (val & PCPPMU_INT_MCB) {
+       if (val & intr_mcb) {
                list_for_each_entry(ctx, &xgene_pmu->mcbpmus, next) {
                        _xgene_pmu_isr(irq, ctx->pmu_dev);
                }
        }
-       if (val & PCPPMU_INT_L3C) {
+       if (val & intr_l3c) {
                list_for_each_entry(ctx, &xgene_pmu->l3cpmus, next) {
                        _xgene_pmu_isr(irq, ctx->pmu_dev);
                }
        }
-       if (val & PCPPMU_INT_IOB) {
+       if (val & intr_iob) {
                list_for_each_entry(ctx, &xgene_pmu->iobpmus, next) {
                        _xgene_pmu_isr(irq, ctx->pmu_dev);
                }
@@ -857,8 +1299,8 @@ static irqreturn_t xgene_pmu_isr(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static int acpi_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
-                                        struct platform_device *pdev)
+static int acpi_pmu_probe_active_mcb_mcu_l3c(struct xgene_pmu *xgene_pmu,
+                                            struct platform_device *pdev)
 {
        void __iomem *csw_csr, *mcba_csr, *mcbb_csr;
        struct resource *res;
@@ -885,6 +1327,8 @@ static int acpi_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
                return PTR_ERR(mcbb_csr);
        }
 
+       xgene_pmu->l3c_active_mask = 0x1;
+
        reg = readl(csw_csr + CSW_CSWCR);
        if (reg & CSW_CSWCR_DUALMCB_MASK) {
                /* Dual MCB active */
@@ -905,8 +1349,56 @@ static int acpi_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
        return 0;
 }
 
-static int fdt_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
-                                       struct platform_device *pdev)
+static int acpi_pmu_v3_probe_active_mcb_mcu_l3c(struct xgene_pmu *xgene_pmu,
+                                               struct platform_device *pdev)
+{
+       void __iomem *csw_csr;
+       struct resource *res;
+       unsigned int reg;
+       u32 mcb0routing;
+       u32 mcb1routing;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       csw_csr = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(csw_csr)) {
+               dev_err(&pdev->dev, "ioremap failed for CSW CSR resource\n");
+               return PTR_ERR(csw_csr);
+       }
+
+       reg = readl(csw_csr + CSW_CSWCR);
+       mcb0routing = CSW_CSWCR_MCB0_ROUTING(reg);
+       mcb1routing = CSW_CSWCR_MCB1_ROUTING(reg);
+       if (reg & CSW_CSWCR_DUALMCB_MASK) {
+               /* Dual MCB active */
+               xgene_pmu->mcb_active_mask = 0x3;
+               /* Probe all active L3C(s), maximum is 8 */
+               xgene_pmu->l3c_active_mask = 0xFF;
+               /* Probe all active MC(s), maximum is 8 */
+               if ((mcb0routing == 0x2) && (mcb1routing == 0x2))
+                       xgene_pmu->mc_active_mask = 0xFF;
+               else if ((mcb0routing == 0x1) && (mcb1routing == 0x1))
+                       xgene_pmu->mc_active_mask =  0x33;
+               else
+                       xgene_pmu->mc_active_mask =  0x11;
+       } else {
+               /* Single MCB active */
+               xgene_pmu->mcb_active_mask = 0x1;
+               /* Probe all active L3C(s), maximum is 4 */
+               xgene_pmu->l3c_active_mask = 0x0F;
+               /* Probe all active MC(s), maximum is 4 */
+               if (mcb0routing == 0x2)
+                       xgene_pmu->mc_active_mask = 0x0F;
+               else if (mcb0routing == 0x1)
+                       xgene_pmu->mc_active_mask =  0x03;
+               else
+                       xgene_pmu->mc_active_mask =  0x01;
+       }
+
+       return 0;
+}
+
+static int fdt_pmu_probe_active_mcb_mcu_l3c(struct xgene_pmu *xgene_pmu,
+                                           struct platform_device *pdev)
 {
        struct regmap *csw_map, *mcba_map, *mcbb_map;
        struct device_node *np = pdev->dev.of_node;
@@ -930,6 +1422,7 @@ static int fdt_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
                return PTR_ERR(mcbb_map);
        }
 
+       xgene_pmu->l3c_active_mask = 0x1;
        if (regmap_read(csw_map, CSW_CSWCR, &reg))
                return -EINVAL;
 
@@ -954,12 +1447,18 @@ static int fdt_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
        return 0;
 }
 
-static int xgene_pmu_probe_active_mcb_mcu(struct xgene_pmu *xgene_pmu,
-                                         struct platform_device *pdev)
+static int xgene_pmu_probe_active_mcb_mcu_l3c(struct xgene_pmu *xgene_pmu,
+                                             struct platform_device *pdev)
 {
-       if (has_acpi_companion(&pdev->dev))
-               return acpi_pmu_probe_active_mcb_mcu(xgene_pmu, pdev);
-       return fdt_pmu_probe_active_mcb_mcu(xgene_pmu, pdev);
+       if (has_acpi_companion(&pdev->dev)) {
+               if (xgene_pmu->version == PCP_PMU_V3)
+                       return acpi_pmu_v3_probe_active_mcb_mcu_l3c(xgene_pmu,
+                                                                   pdev);
+               else
+                       return acpi_pmu_probe_active_mcb_mcu_l3c(xgene_pmu,
+                                                                pdev);
+       }
+       return fdt_pmu_probe_active_mcb_mcu_l3c(xgene_pmu, pdev);
 }
 
 static char *xgene_pmu_dev_name(struct device *dev, u32 type, int id)
@@ -969,6 +1468,8 @@ static char *xgene_pmu_dev_name(struct device *dev, u32 type, int id)
                return devm_kasprintf(dev, GFP_KERNEL, "l3c%d", id);
        case PMU_TYPE_IOB:
                return devm_kasprintf(dev, GFP_KERNEL, "iob%d", id);
+       case PMU_TYPE_IOB_SLOW:
+               return devm_kasprintf(dev, GFP_KERNEL, "iob-slow%d", id);
        case PMU_TYPE_MCB:
                return devm_kasprintf(dev, GFP_KERNEL, "mcb%d", id);
        case PMU_TYPE_MC:
@@ -1047,9 +1548,40 @@ err:
        return NULL;
 }
 
+static const struct acpi_device_id xgene_pmu_acpi_type_match[] = {
+       {"APMC0D5D", PMU_TYPE_L3C},
+       {"APMC0D5E", PMU_TYPE_IOB},
+       {"APMC0D5F", PMU_TYPE_MCB},
+       {"APMC0D60", PMU_TYPE_MC},
+       {"APMC0D84", PMU_TYPE_L3C},
+       {"APMC0D85", PMU_TYPE_IOB},
+       {"APMC0D86", PMU_TYPE_IOB_SLOW},
+       {"APMC0D87", PMU_TYPE_MCB},
+       {"APMC0D88", PMU_TYPE_MC},
+       {},
+};
+
+static const struct acpi_device_id *xgene_pmu_acpi_match_type(
+                                       const struct acpi_device_id *ids,
+                                       struct acpi_device *adev)
+{
+       const struct acpi_device_id *match_id = NULL;
+       const struct acpi_device_id *id;
+
+       for (id = ids; id->id[0] || id->cls; id++) {
+               if (!acpi_match_device_ids(adev, id))
+                       match_id = id;
+               else if (match_id)
+                       break;
+       }
+
+       return match_id;
+}
+
 static acpi_status acpi_pmu_dev_add(acpi_handle handle, u32 level,
                                    void *data, void **return_value)
 {
+       const struct acpi_device_id *acpi_id;
        struct xgene_pmu *xgene_pmu = data;
        struct xgene_pmu_dev_ctx *ctx;
        struct acpi_device *adev;
@@ -1059,17 +1591,11 @@ static acpi_status acpi_pmu_dev_add(acpi_handle handle, u32 level,
        if (acpi_bus_get_status(adev) || !adev->status.present)
                return AE_OK;
 
-       if (!strcmp(acpi_device_hid(adev), "APMC0D5D"))
-               ctx = acpi_get_pmu_hw_inf(xgene_pmu, adev, PMU_TYPE_L3C);
-       else if (!strcmp(acpi_device_hid(adev), "APMC0D5E"))
-               ctx = acpi_get_pmu_hw_inf(xgene_pmu, adev, PMU_TYPE_IOB);
-       else if (!strcmp(acpi_device_hid(adev), "APMC0D5F"))
-               ctx = acpi_get_pmu_hw_inf(xgene_pmu, adev, PMU_TYPE_MCB);
-       else if (!strcmp(acpi_device_hid(adev), "APMC0D60"))
-               ctx = acpi_get_pmu_hw_inf(xgene_pmu, adev, PMU_TYPE_MC);
-       else
-               ctx = NULL;
+       acpi_id = xgene_pmu_acpi_match_type(xgene_pmu_acpi_type_match, adev);
+       if (!acpi_id)
+               return AE_OK;
 
+       ctx = acpi_get_pmu_hw_inf(xgene_pmu, adev, (u32)acpi_id->driver_data);
        if (!ctx)
                return AE_OK;
 
@@ -1086,6 +1612,9 @@ static acpi_status acpi_pmu_dev_add(acpi_handle handle, u32 level,
        case PMU_TYPE_IOB:
                list_add(&ctx->next, &xgene_pmu->iobpmus);
                break;
+       case PMU_TYPE_IOB_SLOW:
+               list_add(&ctx->next, &xgene_pmu->iobpmus);
+               break;
        case PMU_TYPE_MCB:
                list_add(&ctx->next, &xgene_pmu->mcbpmus);
                break;
@@ -1207,6 +1736,9 @@ static int fdt_pmu_probe_pmu_dev(struct xgene_pmu *xgene_pmu,
                case PMU_TYPE_IOB:
                        list_add(&ctx->next, &xgene_pmu->iobpmus);
                        break;
+               case PMU_TYPE_IOB_SLOW:
+                       list_add(&ctx->next, &xgene_pmu->iobpmus);
+                       break;
                case PMU_TYPE_MCB:
                        list_add(&ctx->next, &xgene_pmu->mcbpmus);
                        break;
@@ -1235,6 +1767,40 @@ static const struct xgene_pmu_data xgene_pmu_v2_data = {
        .id   = PCP_PMU_V2,
 };
 
+static const struct xgene_pmu_ops xgene_pmu_ops = {
+       .mask_int = xgene_pmu_mask_int,
+       .unmask_int = xgene_pmu_unmask_int,
+       .read_counter = xgene_pmu_read_counter32,
+       .write_counter = xgene_pmu_write_counter32,
+       .write_evttype = xgene_pmu_write_evttype,
+       .write_agentmsk = xgene_pmu_write_agentmsk,
+       .write_agent1msk = xgene_pmu_write_agent1msk,
+       .enable_counter = xgene_pmu_enable_counter,
+       .disable_counter = xgene_pmu_disable_counter,
+       .enable_counter_int = xgene_pmu_enable_counter_int,
+       .disable_counter_int = xgene_pmu_disable_counter_int,
+       .reset_counters = xgene_pmu_reset_counters,
+       .start_counters = xgene_pmu_start_counters,
+       .stop_counters = xgene_pmu_stop_counters,
+};
+
+static const struct xgene_pmu_ops xgene_pmu_v3_ops = {
+       .mask_int = xgene_pmu_v3_mask_int,
+       .unmask_int = xgene_pmu_v3_unmask_int,
+       .read_counter = xgene_pmu_read_counter64,
+       .write_counter = xgene_pmu_write_counter64,
+       .write_evttype = xgene_pmu_write_evttype,
+       .write_agentmsk = xgene_pmu_v3_write_agentmsk,
+       .write_agent1msk = xgene_pmu_v3_write_agent1msk,
+       .enable_counter = xgene_pmu_enable_counter,
+       .disable_counter = xgene_pmu_disable_counter,
+       .enable_counter_int = xgene_pmu_enable_counter_int,
+       .disable_counter_int = xgene_pmu_disable_counter_int,
+       .reset_counters = xgene_pmu_reset_counters,
+       .start_counters = xgene_pmu_start_counters,
+       .stop_counters = xgene_pmu_stop_counters,
+};
+
 static const struct of_device_id xgene_pmu_of_match[] = {
        { .compatible   = "apm,xgene-pmu",      .data = &xgene_pmu_data },
        { .compatible   = "apm,xgene-pmu-v2",   .data = &xgene_pmu_v2_data },
@@ -1245,6 +1811,7 @@ MODULE_DEVICE_TABLE(of, xgene_pmu_of_match);
 static const struct acpi_device_id xgene_pmu_acpi_match[] = {
        {"APMC0D5B", PCP_PMU_V1},
        {"APMC0D5C", PCP_PMU_V2},
+       {"APMC0D83", PCP_PMU_V3},
        {},
 };
 MODULE_DEVICE_TABLE(acpi, xgene_pmu_acpi_match);
@@ -1284,6 +1851,11 @@ static int xgene_pmu_probe(struct platform_device *pdev)
        if (version < 0)
                return -ENODEV;
 
+       if (version == PCP_PMU_V3)
+               xgene_pmu->ops = &xgene_pmu_v3_ops;
+       else
+               xgene_pmu->ops = &xgene_pmu_ops;
+
        INIT_LIST_HEAD(&xgene_pmu->l3cpmus);
        INIT_LIST_HEAD(&xgene_pmu->iobpmus);
        INIT_LIST_HEAD(&xgene_pmu->mcbpmus);
@@ -1317,7 +1889,7 @@ static int xgene_pmu_probe(struct platform_device *pdev)
        raw_spin_lock_init(&xgene_pmu->lock);
 
        /* Check for active MCBs and MCUs */
-       rc = xgene_pmu_probe_active_mcb_mcu(xgene_pmu, pdev);
+       rc = xgene_pmu_probe_active_mcb_mcu_l3c(xgene_pmu, pdev);
        if (rc) {
                dev_warn(&pdev->dev, "Unknown MCB/MCU active status\n");
                xgene_pmu->mcb_active_mask = 0x1;
@@ -1342,7 +1914,7 @@ static int xgene_pmu_probe(struct platform_device *pdev)
        }
 
        /* Enable interrupt */
-       xgene_pmu_unmask_int(xgene_pmu);
+       xgene_pmu->ops->unmask_int(xgene_pmu);
 
        return 0;
 
index ed4c343..5429d37 100644 (file)
@@ -7,11 +7,24 @@
 
 #include <linux/init.h>
 #include <linux/ras.h>
+#include <linux/uuid.h>
 
 #define CREATE_TRACE_POINTS
 #define TRACE_INCLUDE_PATH ../../include/ras
 #include <ras/ras_event.h>
 
+void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id,
+                           const char *fru_text, const u8 sev, const u8 *err,
+                           const u32 len)
+{
+       trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
+}
+
+void log_arm_hw_error(struct cper_sec_proc_arm *err)
+{
+       trace_arm_event(err);
+}
+
 static int __init ras_init(void)
 {
        int rc = 0;
@@ -27,7 +40,8 @@ subsys_initcall(ras_init);
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
-
+EXPORT_TRACEPOINT_SYMBOL_GPL(non_standard_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(arm_event);
 
 static int __init parse_ras_param(char *str)
 {
index 4ee5527..45629f4 100644 (file)
@@ -504,7 +504,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
                if (&m->list == &kclist_head) {
                        if (clear_user(buffer, tsz))
                                return -EFAULT;
-               } else if (is_vmalloc_or_module_addr((void *)start)) {
+               } else if (m->type == KCORE_VMALLOC) {
                        vread(buf, (char *)start, tsz);
                        /* we have to zero-fill user buffer even if no read */
                        if (copy_to_user(buffer, buf, tsz))
index 720446c..9f26e01 100644 (file)
@@ -1,3 +1,6 @@
+#ifndef GHES_H
+#define GHES_H
+
 #include <acpi/apei.h>
 #include <acpi/hed.h>
 
 #define GHES_EXITING           0x0002
 
 struct ghes {
-       struct acpi_hest_generic *generic;
+       union {
+               struct acpi_hest_generic *generic;
+               struct acpi_hest_generic_v2 *generic_v2;
+       };
        struct acpi_hest_generic_status *estatus;
        u64 buffer_paddr;
        unsigned long flags;
@@ -70,3 +76,43 @@ static inline void ghes_edac_unregister(struct ghes *ghes)
 {
 }
 #endif
+
+static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
+{
+       return gdata->revision >> 8;
+}
+
+static inline void *acpi_hest_get_payload(struct acpi_hest_generic_data *gdata)
+{
+       if (acpi_hest_get_version(gdata) >= 3)
+               return (void *)(((struct acpi_hest_generic_data_v300 *)(gdata)) + 1);
+
+       return gdata + 1;
+}
+
+static inline int acpi_hest_get_error_length(struct acpi_hest_generic_data *gdata)
+{
+       return ((struct acpi_hest_generic_data *)(gdata))->error_data_length;
+}
+
+static inline int acpi_hest_get_size(struct acpi_hest_generic_data *gdata)
+{
+       if (acpi_hest_get_version(gdata) >= 3)
+               return sizeof(struct acpi_hest_generic_data_v300);
+
+       return sizeof(struct acpi_hest_generic_data);
+}
+
+static inline int acpi_hest_get_record_size(struct acpi_hest_generic_data *gdata)
+{
+       return (acpi_hest_get_size(gdata) + acpi_hest_get_error_length(gdata));
+}
+
+static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
+{
+       return (void *)(gdata) + acpi_hest_get_record_size(gdata);
+}
+
+int ghes_notify_sea(void);
+
+#endif /* GHES_H */
index 3ff9ace..8379d40 100644 (file)
@@ -31,7 +31,6 @@ void iort_deregister_domain_token(int trans_id);
 struct fwnode_handle *iort_find_domain_token(int trans_id);
 #ifdef CONFIG_ACPI_IORT
 void acpi_iort_init(void);
-bool iort_node_match(u8 type);
 u32 iort_msi_map_rid(struct device *dev, u32 req_id);
 struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id);
 void acpi_configure_pmsi_domain(struct device *dev);
@@ -41,7 +40,6 @@ void iort_set_dma_mask(struct device *dev);
 const struct iommu_ops *iort_iommu_configure(struct device *dev);
 #else
 static inline void acpi_iort_init(void) { }
-static inline bool iort_node_match(u8 type) { return false; }
 static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id)
 { return req_id; }
 static inline struct irq_domain *iort_get_device_domain(struct device *dev,
index dcacb1a..4c671fc 100644 (file)
@@ -180,6 +180,10 @@ enum {
 #define CPER_SEC_PROC_IPF                                              \
        UUID_LE(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00,     \
                0x80, 0xC7, 0x3C, 0x88, 0x81)
+/* Processor Specific: ARM */
+#define CPER_SEC_PROC_ARM                                              \
+       UUID_LE(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05,     \
+               0x1D, 0x5D, 0x46, 0xB0)
 /* Platform Memory */
 #define CPER_SEC_PLATFORM_MEM                                          \
        UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83,     \
@@ -255,6 +259,22 @@ enum {
 
 #define CPER_PCIE_SLOT_SHIFT                   3
 
+#define CPER_ARM_VALID_MPIDR                   BIT(0)  /* bits of cper_sec_proc_arm.validation_bits */
+#define CPER_ARM_VALID_AFFINITY_LEVEL          BIT(1)
+#define CPER_ARM_VALID_RUNNING_STATE           BIT(2)
+#define CPER_ARM_VALID_VENDOR_INFO             BIT(3)
+
+#define CPER_ARM_INFO_VALID_MULTI_ERR          BIT(0)  /* NOTE(review): presumably bits of cper_arm_err_info.validation_bits - confirm */
+#define CPER_ARM_INFO_VALID_FLAGS              BIT(1)
+#define CPER_ARM_INFO_VALID_ERR_INFO           BIT(2)
+#define CPER_ARM_INFO_VALID_VIRT_ADDR          BIT(3)
+#define CPER_ARM_INFO_VALID_PHYSICAL_ADDR      BIT(4)
+
+#define CPER_ARM_INFO_FLAGS_FIRST              BIT(0)  /* NOTE(review): presumably bits of cper_arm_err_info.flags - confirm */
+#define CPER_ARM_INFO_FLAGS_LAST               BIT(1)
+#define CPER_ARM_INFO_FLAGS_PROPAGATED         BIT(2)
+#define CPER_ARM_INFO_FLAGS_OVERFLOW           BIT(3)
+
 /*
  * All tables and structs must be byte-packed to match CPER
  * specification, since the tables are provided by the system BIOS
@@ -340,6 +360,40 @@ struct cper_ia_proc_ctx {
        __u64   mm_reg_addr;
 };
 
+/* ARM Processor Error Section */
+struct cper_sec_proc_arm {
+       __u32   validation_bits;        /* CPER_ARM_VALID_* flags */
+       __u16   err_info_num;           /* Number of Processor Error Info */
+       __u16   context_info_num;       /* Number of Processor Context Info Records */
+       __u32   section_length;
+       __u8    affinity_level;         /* read only if CPER_ARM_VALID_AFFINITY_LEVEL is set */
+       __u8    reserved[3];            /* must be zero */
+       __u64   mpidr;                  /* read only if CPER_ARM_VALID_MPIDR is set */
+       __u64   midr;
+       __u32   running_state;          /* Bit 0 set - Processor running. PSCI = 0 */
+       __u32   psci_state;             /* read only if CPER_ARM_VALID_RUNNING_STATE is set */
+};
+
+/* ARM Processor Error Information Structure */
+struct cper_arm_err_info {
+       __u8    version;
+       __u8    length;
+       __u16   validation_bits;        /* NOTE(review): presumably CPER_ARM_INFO_VALID_* flags - confirm */
+       __u8    type;
+       __u16   multiple_error;         /* not naturally aligned; relies on the byte-packing this header requires */
+       __u8    flags;                  /* NOTE(review): presumably CPER_ARM_INFO_FLAGS_* flags - confirm */
+       __u64   error_info;
+       __u64   virt_fault_addr;
+       __u64   physical_fault_addr;
+};
+
+/* ARM Processor Context Information Structure */
+struct cper_arm_ctx_info {
+       __u16   version;
+       __u16   type;
+       __u32   size;                   /* NOTE(review): presumably byte size of the context data following this header - confirm */
+};
+
 /* Old Memory Error Section UEFI 2.1, 2.2 */
 struct cper_sec_mem_err_old {
        __u64   validation_bits;
index ffb1471..be5338a 100644 (file)
@@ -2,6 +2,8 @@
 #define __RAS_H__
 
 #include <asm/errno.h>
+#include <linux/uuid.h>
+#include <linux/cper.h>
 
 #ifdef CONFIG_DEBUG_FS
 int ras_userspace_consumers(void);
@@ -22,4 +24,19 @@ static inline void __init cec_init(void)     { }
 static inline int cec_add_elem(u64 pfn)                { return -ENODEV; }
 #endif
 
+#ifdef CONFIG_RAS
+void log_non_standard_event(const guid_t *sec_type,
+                           const guid_t *fru_id, const char *fru_text,
+                           const u8 sev, const u8 *err, const u32 len);
+void log_arm_hw_error(struct cper_sec_proc_arm *err);
+#else  /* !CONFIG_RAS: empty inline stubs with the same signatures */
+static inline void
+log_non_standard_event(const guid_t *sec_type,
+                      const guid_t *fru_id, const char *fru_text,
+                      const u8 sev, const u8 *err, const u32 len)
+{ return; }
+static inline void
+log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
+#endif /* CONFIG_RAS */
+
 #endif /* __RAS_H__ */
index d1defe4..2251e19 100644 (file)
 
 #include <uapi/linux/uuid.h>
 
+#define UUID_SIZE 16
+
 typedef struct {
-       __u8 b[16];
+       __u8 b[UUID_SIZE];
 } uuid_t;
 
 #define UUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)                     \
index 1791a12..429f46f 100644 (file)
@@ -161,6 +161,96 @@ TRACE_EVENT(mc_event,
                  __get_str(driver_detail))
 );
 
+/*
+ * ARM Processor Events Report
+ *
+ * This event is generated when hardware detects that an ARM processor
+ * error has occurred. See UEFI 2.6 spec section N.2.4.4.
+ */
+TRACE_EVENT(arm_event,
+
+       TP_PROTO(const struct cper_sec_proc_arm *proc),
+
+       TP_ARGS(proc),
+
+       TP_STRUCT__entry(
+               __field(u64, mpidr)
+               __field(u64, midr)
+               __field(u32, running_state)
+               __field(u32, psci_state)
+               __field(u8, affinity)
+       ),
+
+       TP_fast_assign(
+               if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL)
+                       __entry->affinity = proc->affinity_level;
+               else
+                       __entry->affinity = ~0;         /* all-ones: affinity level not flagged valid */
+               if (proc->validation_bits & CPER_ARM_VALID_MPIDR)
+                       __entry->mpidr = proc->mpidr;
+               else
+                       __entry->mpidr = 0ULL;          /* zero: MPIDR not flagged valid */
+               __entry->midr = proc->midr;             /* copied unconditionally, no validation bit checked */
+               if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) {
+                       __entry->running_state = proc->running_state;
+                       __entry->psci_state = proc->psci_state;
+               } else {
+                       __entry->running_state = ~0;    /* all-ones: running/PSCI state not flagged valid */
+                       __entry->psci_state = ~0;
+               }
+       ),
+
+       TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+                 "running state: %d; PSCI state: %d",
+                 __entry->affinity, __entry->mpidr, __entry->midr,
+                 __entry->running_state, __entry->psci_state)
+);
+
+/*
+ * Non-Standard Section Report
+ *
+ * This event is generated when hardware detects a hardware
+ * error event that is reported either in a non-standard section,
+ * as defined in the UEFI spec appendix "Common Platform Error
+ * Record", or in a section for which no TRACE_EVENT is defined.
+ *
+ */
+TRACE_EVENT(non_standard_event,
+
+       TP_PROTO(const uuid_le *sec_type,
+                const uuid_le *fru_id,
+                const char *fru_text,
+                const u8 sev,
+                const u8 *err,
+                const u32 len),
+
+       TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
+
+       TP_STRUCT__entry(
+               __array(char, sec_type, UUID_SIZE)      /* raw copy of the section-type UUID */
+               __array(char, fru_id, UUID_SIZE)        /* raw copy of the FRU UUID */
+               __string(fru_text, fru_text)
+               __field(u8, sev)
+               __field(u32, len)
+               __dynamic_array(u8, buf, len)           /* holds len bytes copied from err */
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->sec_type, sec_type, UUID_SIZE);
+               memcpy(__entry->fru_id, fru_id, UUID_SIZE);
+               __assign_str(fru_text, fru_text);
+               __entry->sev = sev;
+               __entry->len = len;
+               memcpy(__get_dynamic_array(buf), err, len);     /* err must provide at least len bytes */
+       ),
+
+       TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
+                 __entry->sev, __entry->sec_type,
+                 __entry->fru_id, __get_str(fru_text),
+                 __entry->len,
+                 __print_hex(__get_dynamic_array(buf), __entry->len))
+);
+
 /*
  * PCIe AER Trace event
  *
index e2e5eff..1c44aa3 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/virt.h>
+#include <asm/system_misc.h>
 
 #include "trace.h"
 
@@ -1430,6 +1431,25 @@ out:
                kvm_set_pfn_accessed(pfn);
 }
 
+static bool is_abort_sea(unsigned long fault_status)
+{      /* True when fault_status is one of the SEA or SECC codes listed below. */
+       switch (fault_status) {
+       case FSC_SEA:           /* SEA: synchronous external abort */
+       case FSC_SEA_TTW0:      /* NOTE(review): TTWn presumably = abort on table walk, level n - confirm */
+       case FSC_SEA_TTW1:
+       case FSC_SEA_TTW2:
+       case FSC_SEA_TTW3:
+       case FSC_SECC:          /* NOTE(review): presumably the parity/ECC error variants - confirm */
+       case FSC_SECC_TTW0:
+       case FSC_SECC_TTW1:
+       case FSC_SECC_TTW2:
+       case FSC_SECC_TTW3:
+               return true;
+       default:                /* every other fault status code */
+               return false;
+       }
+}
+
 /**
  * kvm_handle_guest_abort - handles all 2nd stage aborts
  * @vcpu:      the VCPU pointer
@@ -1452,19 +1472,29 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
        gfn_t gfn;
        int ret, idx;
 
+       fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+
+       fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+
+       /*
+        * The host kernel will handle the synchronous external abort. There
+        * is no need to pass the error into the guest.
+        */
+       if (is_abort_sea(fault_status)) {
+               if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+                       return 1;
+       }
+
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
        if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
                kvm_inject_vabt(vcpu);
                return 1;
        }
 
-       fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
-
        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
                              kvm_vcpu_get_hfar(vcpu), fault_ipa);
 
        /* Check the stage-2 fault is trans. fault or write fault */
-       fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
        if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
            fault_status != FSC_ACCESS) {
                kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",