Merge tag 'powerpc-5.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
author     Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 22 Feb 2021 22:34:00 +0000 (14:34 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 22 Feb 2021 22:34:00 +0000 (14:34 -0800)
Pull powerpc updates from Michael Ellerman:

 - A large series adding wrappers for our interrupt handlers, so that
   irq/nmi/user tracking can be isolated in the wrappers rather than
   spread across each handler (a sketch of the new style follows this
   list).

 - Conversion of the 32-bit syscall handling into C.

 - A series from Nick to streamline our TLB flushing when using the
   Radix MMU.

 - Switch to using queued spinlocks by default for 64-bit server CPUs.

 - A rework of our PCI probing so that it happens later in boot, when
   more generic infrastructure is available.

 - Two small fixes to allow 32-bit little-endian processes to run on
   64-bit kernels.

 - Other smaller features, fixes & cleanups.
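
A sketch of the new handler style referenced in the first item (the
handler name and body here are hypothetical; DEFINE_INTERRUPT_HANDLER
and _exception() are taken from the asm/interrupt.h and asm/bug.h
changes below):

    #include <asm/interrupt.h>

    /*
     * The macro emits the wrapper that the entry asm calls; the wrapper
     * reconciles irq/nmi/user state, then runs this plain C body.
     */
    DEFINE_INTERRUPT_HANDLER(example_exception)
    {
            _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
    }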

Thanks to: Alexey Kardashevskiy, Ananth N Mavinakayanahalli, Aneesh
Kumar K.V, Athira Rajeev, Bhaskar Chowdhury, Cédric Le Goater, Chengyang
Fan, Christophe Leroy, Christopher M. Riedl, Fabiano Rosas, Florian
Fainelli, Frederic Barrat, Ganesh Goudar, Hari Bathini, Jiapeng Chong,
Joseph J Allen, Kajol Jain, Markus Elfring, Michal Suchanek, Nathan
Lynch, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Pingfan Liu,
Po-Hsu Lin, Qian Cai, Ram Pai, Randy Dunlap, Sandipan Das, Stephen
Rothwell, Tyrel Datwyler, Will Springer, Yury Norov, and Zheng Yongjun.

* tag 'powerpc-5.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (188 commits)
  powerpc/perf: Adds support for programming of Thresholding in P10
  powerpc/pci: Remove unimplemented prototypes
  powerpc/uaccess: Merge raw_copy_to_user_allowed() into raw_copy_to_user()
  powerpc/uaccess: Merge __put_user_size_allowed() into __put_user_size()
  powerpc/uaccess: get rid of small constant size cases in raw_copy_{to,from}_user()
  powerpc/64: Fix stack trace not displaying final frame
  powerpc/time: Remove get_tbl()
  powerpc/time: Avoid using get_tbl()
  spi: mpc52xx: Avoid using get_tbl()
  powerpc/syscall: Avoid storing 'current' in another pointer
  powerpc/32: Handle bookE debugging in C in syscall entry/exit
  powerpc/syscall: Do not check unsupported scv vector on PPC32
  powerpc/32: Remove the counter in global_dbcr0
  powerpc/32: Remove verification of MSR_PR on syscall in the ASM entry
  powerpc/syscall: implement system call entry/exit logic in C for PPC32
  powerpc/32: Always save non volatile GPRs at syscall entry
  powerpc/syscall: Change condition to check MSR_RI
  powerpc/syscall: Save r3 in regs->orig_r3
  powerpc/syscall: Use is_compat_task()
  powerpc/syscall: Make interrupt.c buildable on PPC32
  ...

194 files changed:
arch/powerpc/Kconfig
arch/powerpc/Kconfig.debug
arch/powerpc/configs/44x/akebono_defconfig
arch/powerpc/include/asm/asm-prototypes.h
arch/powerpc/include/asm/book3s/32/kup.h
arch/powerpc/include/asm/book3s/32/mmu-hash.h
arch/powerpc/include/asm/book3s/64/kup.h
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/book3s/64/mmu.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
arch/powerpc/include/asm/book3s/64/tlbflush.h
arch/powerpc/include/asm/bug.h
arch/powerpc/include/asm/cacheflush.h
arch/powerpc/include/asm/cputime.h
arch/powerpc/include/asm/debug.h
arch/powerpc/include/asm/firmware.h
arch/powerpc/include/asm/hugetlb.h
arch/powerpc/include/asm/hw_irq.h
arch/powerpc/include/asm/interrupt.h [new file with mode: 0644]
arch/powerpc/include/asm/kexec.h
arch/powerpc/include/asm/kup.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/machdep.h
arch/powerpc/include/asm/mce.h
arch/powerpc/include/asm/mmu_context.h
arch/powerpc/include/asm/nmi.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/paravirt.h
arch/powerpc/include/asm/perf_event.h
arch/powerpc/include/asm/perf_event_server.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/pkeys.h
arch/powerpc/include/asm/ppc-pci.h
arch/powerpc/include/asm/ppc_asm.h
arch/powerpc/include/asm/ptrace.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/reg_booke.h
arch/powerpc/include/asm/rtas.h
arch/powerpc/include/asm/setup.h
arch/powerpc/include/asm/simple_spinlock.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/thread_info.h
arch/powerpc/include/asm/time.h
arch/powerpc/include/asm/uaccess.h
arch/powerpc/include/asm/vdso/timebase.h
arch/powerpc/include/asm/xmon.h
arch/powerpc/include/uapi/asm/perf_regs.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/dbell.c
arch/powerpc/kernel/eeh.c
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/exceptions-64e.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/head_32.h
arch/powerpc/kernel/head_40x.S
arch/powerpc/kernel/head_44x.S
arch/powerpc/kernel/head_8xx.S
arch/powerpc/kernel/head_book3s_32.S
arch/powerpc/kernel/head_booke.h
arch/powerpc/kernel/head_fsl_booke.S
arch/powerpc/kernel/idle_book3s.S
arch/powerpc/kernel/interrupt.c [new file with mode: 0644]
arch/powerpc/kernel/iommu.c
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/mce.c
arch/powerpc/kernel/optprobes.c
arch/powerpc/kernel/pci-common.c
arch/powerpc/kernel/pci_dn.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/prom_init.c
arch/powerpc/kernel/ptrace/ptrace.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup.h
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/signal.c
arch/powerpc/kernel/signal_32.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/sys_ppc32.c
arch/powerpc/kernel/syscall_64.c [deleted file]
arch/powerpc/kernel/syscalls/syscall.tbl
arch/powerpc/kernel/tau_6xx.c
arch/powerpc/kernel/time.c
arch/powerpc/kernel/traps.c
arch/powerpc/kernel/watchdog.c
arch/powerpc/kexec/elf_64.c
arch/powerpc/kexec/file_load_64.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/lib/pmem.c
arch/powerpc/lib/sstep.c
arch/powerpc/mm/book3s32/Makefile
arch/powerpc/mm/book3s32/mmu.c
arch/powerpc/mm/book3s64/hash_hugetlbpage.c
arch/powerpc/mm/book3s64/hash_utils.c
arch/powerpc/mm/book3s64/internal.h
arch/powerpc/mm/book3s64/iommu_api.c
arch/powerpc/mm/book3s64/pgtable.c
arch/powerpc/mm/book3s64/radix_tlb.c
arch/powerpc/mm/book3s64/slb.c
arch/powerpc/mm/fault.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/pgtable.c
arch/powerpc/mm/ptdump/segment_regs.c
arch/powerpc/perf/core-book3s.c
arch/powerpc/perf/core-fsl-emb.c
arch/powerpc/perf/hv-24x7.c
arch/powerpc/perf/isa207-common.c
arch/powerpc/perf/isa207-common.h
arch/powerpc/perf/mpc7450-pmu.c
arch/powerpc/perf/perf_regs.c
arch/powerpc/perf/power10-pmu.c
arch/powerpc/perf/power5+-pmu.c
arch/powerpc/perf/power5-pmu.c
arch/powerpc/perf/power6-pmu.c
arch/powerpc/perf/power7-pmu.c
arch/powerpc/perf/ppc970-pmu.c
arch/powerpc/platforms/44x/Kconfig
arch/powerpc/platforms/512x/mpc5121_ads.c
arch/powerpc/platforms/52xx/efika.c
arch/powerpc/platforms/52xx/lite5200.c
arch/powerpc/platforms/52xx/media5200.c
arch/powerpc/platforms/52xx/mpc5200_simple.c
arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c
arch/powerpc/platforms/82xx/mpc8272_ads.c
arch/powerpc/platforms/82xx/pq2ads-pci-pic.c
arch/powerpc/platforms/82xx/pq2fads.c
arch/powerpc/platforms/83xx/asp834x.c
arch/powerpc/platforms/83xx/km83xx.c
arch/powerpc/platforms/83xx/misc.c
arch/powerpc/platforms/83xx/mpc830x_rdb.c
arch/powerpc/platforms/83xx/mpc831x_rdb.c
arch/powerpc/platforms/83xx/mpc832x_mds.c
arch/powerpc/platforms/83xx/mpc832x_rdb.c
arch/powerpc/platforms/83xx/mpc834x_itx.c
arch/powerpc/platforms/83xx/mpc834x_mds.c
arch/powerpc/platforms/83xx/mpc836x_mds.c
arch/powerpc/platforms/83xx/mpc836x_rdk.c
arch/powerpc/platforms/83xx/mpc837x_mds.c
arch/powerpc/platforms/83xx/mpc837x_rdb.c
arch/powerpc/platforms/83xx/mpc83xx.h
arch/powerpc/platforms/8xx/machine_check.c
arch/powerpc/platforms/amigaone/setup.c
arch/powerpc/platforms/cell/pervasive.c
arch/powerpc/platforms/cell/pervasive.h
arch/powerpc/platforms/cell/ras.c
arch/powerpc/platforms/cell/ras.h
arch/powerpc/platforms/chrp/pci.c
arch/powerpc/platforms/chrp/setup.c
arch/powerpc/platforms/embedded6xx/holly.c
arch/powerpc/platforms/embedded6xx/linkstation.c
arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
arch/powerpc/platforms/embedded6xx/mvme5100.c
arch/powerpc/platforms/embedded6xx/storcenter.c
arch/powerpc/platforms/maple/pci.c
arch/powerpc/platforms/maple/setup.c
arch/powerpc/platforms/pasemi/setup.c
arch/powerpc/platforms/powermac/pci.c
arch/powerpc/platforms/powermac/setup.c
arch/powerpc/platforms/powernv/idle.c
arch/powerpc/platforms/powernv/memtrace.c
arch/powerpc/platforms/powernv/opal.c
arch/powerpc/platforms/powernv/pci-ioda-tce.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.h
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/platforms/powernv/subcore.h
arch/powerpc/platforms/powernv/vas.c
arch/powerpc/platforms/powernv/vas.h
arch/powerpc/platforms/pseries/dlpar.c
arch/powerpc/platforms/pseries/eeh_pseries.c
arch/powerpc/platforms/pseries/pci.c
arch/powerpc/platforms/pseries/pseries.h
arch/powerpc/platforms/pseries/ras.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/xmon/xmon.c
drivers/misc/cxl/cxllib.c
drivers/misc/ocxl/file.c
drivers/spi/spi-mpc52xx.c
tools/testing/selftests/powerpc/eeh/eeh-basic.sh
tools/testing/selftests/powerpc/eeh/eeh-functions.sh [changed mode: 0755->0644]
tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh [new file with mode: 0755]
tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh [new file with mode: 0755]

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9141f03..2ffb229 100644
@@ -196,7 +196,6 @@ config PPC
        select HAVE_STACKPROTECTOR              if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
        select HAVE_STACKPROTECTOR              if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
        select HAVE_CONTEXT_TRACKING            if PPC64
-       select HAVE_TIF_NOHZ                    if PPC64
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_DYNAMIC_FTRACE
@@ -503,18 +502,14 @@ config HOTPLUG_CPU
          Say N if you are unsure.
 
 config PPC_QUEUED_SPINLOCKS
-       bool "Queued spinlocks"
+       bool "Queued spinlocks" if EXPERT
        depends on SMP
+       default PPC_BOOK3S_64
        help
          Say Y here to use queued spinlocks which give better scalability and
          fairness on large SMP and NUMA systems without harming single threaded
          performance.
 
-         This option is currently experimental, the code is more complex and
-         less tested so it defaults to "N" for the moment.
-
-         If unsure, say "N".
-
 config ARCH_CPU_PROBE_RELEASE
        def_bool y
        depends on HOTPLUG_CPU
@@ -718,18 +713,6 @@ config ARCH_MEMORY_PROBE
        def_bool y
        depends on MEMORY_HOTPLUG
 
-config STDBINUTILS
-       bool "Using standard binutils settings"
-       depends on 44x
-       default y
-       help
-         Turning this option off allows you to select 256KB PAGE_SIZE on 44x.
-         Note, that kernel will be able to run only those applications,
-         which had been compiled using binutils later than 2.17.50.0.3 with
-         '-zmax-page-size' set to 256K (the default is 64K). Or, if using
-         the older binutils, you can patch them with a trivial patch, which
-         changes the ELF_MAXPAGESIZE definition from 0x10000 to 0x40000.
-
 choice
        prompt "Page size"
        default PPC_4K_PAGES
@@ -769,17 +752,15 @@ config PPC_64K_PAGES
        select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
-       bool "256k page size"
-       depends on 44x && !STDBINUTILS
+       bool "256k page size (Requires non-standard binutils settings)"
+       depends on 44x && !PPC_47x
        help
          Make the page size 256k.
 
-         As the ELF standard only requires alignment to support page
-         sizes up to 64k, you will need to compile all of your user
-         space applications with a non-standard binutils settings
-         (see the STDBINUTILS description for details).
-
-         Say N unless you know what you are doing.
+         The kernel will only be able to run applications that have been
+         compiled with '-zmax-page-size' set to 256K (the default is 64K) using
+         binutils later than 2.17.50.0.3, or by patching the ELF_MAXPAGESIZE
+         definition from 0x10000 to 0x40000 in older versions.
 
 endchoice
 
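A note on the 256k help text above: userspace binaries need their ELF
segments aligned to the larger page size, which means passing the
linker flag named in the help. A hedged example (the cross-compiler
triplet is illustrative):

    # align ELF segments for a 256K (0x40000) page size
    powerpc-linux-gnu-gcc -Wl,-z,max-page-size=0x40000 -o app app.c
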
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index b88900f..ae08435 100644
@@ -88,6 +88,7 @@ config PPC_IRQ_SOFT_MASK_DEBUG
 config XMON
        bool "Include xmon kernel debugger"
        depends on DEBUG_KERNEL
+       select CONSOLE_POLL if SERIAL_CPM_CONSOLE
        help
          Include in-kernel hooks for the xmon kernel monitor/debugger.
          Unless you are intending to debug the kernel, say N here.
diff --git a/arch/powerpc/configs/44x/akebono_defconfig b/arch/powerpc/configs/44x/akebono_defconfig
index 72b8f93..4bc549c 100644
@@ -20,6 +20,7 @@ CONFIG_IRQ_ALL_CPUS=y
 # CONFIG_COMPACTION is not set
 # CONFIG_SUSPEND is not set
 CONFIG_NET=y
+CONFIG_NETDEVICES=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
@@ -40,7 +41,9 @@ CONFIG_BLK_DEV_RAM_SIZE=35000
 # CONFIG_SCSI_PROC_FS is not set
 CONFIG_BLK_DEV_SD=y
 # CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
 # CONFIG_SATA_PMP is not set
+CONFIG_SATA_AHCI_PLATFORM=y
 # CONFIG_ATA_SFF is not set
 # CONFIG_NET_VENDOR_3COM is not set
 # CONFIG_NET_VENDOR_ADAPTEC is not set
@@ -97,6 +100,8 @@ CONFIG_USB_OHCI_HCD=y
 # CONFIG_USB_OHCI_HCD_PCI is not set
 CONFIG_USB_STORAGE=y
 CONFIG_MMC=y
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_M41T80=y
 CONFIG_EXT2_FS=y
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d0b832c..939f3c9 100644
@@ -56,35 +56,6 @@ int exit_vmx_usercopy(void);
 int enter_vmx_ops(void);
 void *exit_vmx_ops(void *dest);
 
-/* Traps */
-long machine_check_early(struct pt_regs *regs);
-long hmi_exception_realmode(struct pt_regs *regs);
-void SMIException(struct pt_regs *regs);
-void handle_hmi_exception(struct pt_regs *regs);
-void instruction_breakpoint_exception(struct pt_regs *regs);
-void RunModeException(struct pt_regs *regs);
-void single_step_exception(struct pt_regs *regs);
-void program_check_exception(struct pt_regs *regs);
-void alignment_exception(struct pt_regs *regs);
-void StackOverflow(struct pt_regs *regs);
-void stack_overflow_exception(struct pt_regs *regs);
-void kernel_fp_unavailable_exception(struct pt_regs *regs);
-void altivec_unavailable_exception(struct pt_regs *regs);
-void vsx_unavailable_exception(struct pt_regs *regs);
-void fp_unavailable_tm(struct pt_regs *regs);
-void altivec_unavailable_tm(struct pt_regs *regs);
-void vsx_unavailable_tm(struct pt_regs *regs);
-void facility_unavailable_exception(struct pt_regs *regs);
-void TAUException(struct pt_regs *regs);
-void altivec_assist_exception(struct pt_regs *regs);
-void unrecoverable_exception(struct pt_regs *regs);
-void kernel_bad_stack(struct pt_regs *regs);
-void system_reset_exception(struct pt_regs *regs);
-void machine_check_exception(struct pt_regs *regs);
-void emulation_assist_interrupt(struct pt_regs *regs);
-long do_slb_fault(struct pt_regs *regs, unsigned long ea);
-void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
-
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
                    struct ucontext __user *new_ctx,
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index a0117a9..73bc5d2 100644
@@ -95,12 +95,12 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 end)
        addr &= 0xf0000000;     /* align addr to start of segment */
        barrier();      /* make sure thread.kuap is updated before playing with SRs */
        while (addr < end) {
-               mtsrin(sr, addr);
+               mtsr(sr, addr);
                sr += 0x111;            /* next VSID */
                sr &= 0xf0ffffff;       /* clear VSID overflow */
                addr += 0x10000000;     /* address of next segment */
        }
-       isync();        /* Context sync required after mtsrin() */
+       isync();        /* Context sync required after mtsr() */
 }
 
 static __always_inline void allow_user_access(void __user *to, const void __user *from,
@@ -122,7 +122,7 @@ static __always_inline void allow_user_access(void __user *to, const void __user
        end = min(addr + size, TASK_SIZE);
 
        current->thread.kuap = (addr & 0xf0000000) | ((((end - 1) >> 28) + 1) & 0xf);
-       kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end);       /* Clear Ks */
+       kuap_update_sr(mfsr(addr) & ~SR_KS, addr, end); /* Clear Ks */
 }
 
 static __always_inline void prevent_user_access(void __user *to, const void __user *from,
@@ -151,7 +151,7 @@ static __always_inline void prevent_user_access(void __user *to, const void __us
        }
 
        current->thread.kuap = 0;
-       kuap_update_sr(mfsrin(addr) | SR_KS, addr, end);        /* set Ks */
+       kuap_update_sr(mfsr(addr) | SR_KS, addr, end);  /* set Ks */
 }
 
 static inline unsigned long prevent_user_access_return(void)
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 685c589..b85f8e1 100644
@@ -94,7 +94,7 @@ typedef struct {
 } mm_context_t;
 
 void update_bats(void);
-static inline void cleanup_cpu_mmu_context(void) { };
+static inline void cleanup_cpu_mmu_context(void) { }
 
 /* patch sites */
 extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2;
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index 7d1ef7b..8bd9050 100644
@@ -339,7 +339,7 @@ static inline unsigned long get_kuap(void)
         * This has no effect in terms of actually blocking things on hash,
         * so it doesn't break anything.
         */
-       if (!early_mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
+       if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
                return AMR_KUAP_BLOCKED;
 
        return mfspr(SPRN_AMR);
@@ -347,7 +347,7 @@ static inline unsigned long get_kuap(void)
 
 static inline void set_kuap(unsigned long value)
 {
-       if (!early_mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
+       if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
                return;
 
        /*
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 066b1d3..f911bdb 100644
@@ -454,6 +454,8 @@ static inline unsigned long hpt_hash(unsigned long vpn,
 #define HPTE_NOHPTE_UPDATE     0x2
 #define HPTE_USE_KERNEL_KEY    0x4
 
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa,
+                          unsigned long rflags, unsigned long vflags, int psize, int ssize);
 extern int __hash_page_4K(unsigned long ea, unsigned long access,
                          unsigned long vsid, pte_t *ptep, unsigned long trap,
                          unsigned long flags, int ssize, int subpage_prot);
@@ -467,6 +469,8 @@ extern int hash_page_mm(struct mm_struct *mm, unsigned long ea,
                        unsigned long flags);
 extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
                     unsigned long dsisr);
+void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc);
+int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, unsigned long msr);
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                     pte_t *ptep, unsigned long trap, unsigned long flags,
                     int ssize, unsigned int shift, unsigned int mmu_psize);
@@ -521,6 +525,7 @@ void slb_dump_contents(struct slb_entry *slb_ptr);
 
 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
+void preload_new_slb_context(unsigned long start, unsigned long sp);
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 995bbcd..eace8c3 100644
@@ -239,7 +239,7 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 #ifdef CONFIG_PPC_PSERIES
 extern void radix_init_pseries(void);
 #else
-static inline void radix_init_pseries(void) { };
+static inline void radix_init_pseries(void) { }
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index a398866..058601e 100644
@@ -388,11 +388,28 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define ptep_test_and_clear_young(__vma, __addr, __ptep)       \
 ({                                                             \
-       int __r;                                                \
-       __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
-       __r;                                                    \
+       __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
 })
 
+/*
+ * On Book3S CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young ptep_test_and_clear_young
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+#define pmdp_clear_flush_young pmdp_test_and_clear_young
+
 static inline int __pte_write(pte_t pte)
 {
        return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 94439e0..8b33601 100644
@@ -35,7 +35,7 @@ extern void radix__flush_pwc_lpid(unsigned int lpid);
 extern void radix__flush_all_lpid(unsigned int lpid);
 extern void radix__flush_all_lpid_guest(unsigned int lpid);
 #else
-static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
+static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }
 static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
                                        unsigned long addr,
                                        unsigned long page_size)
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index dcb5c38..215973b 100644
@@ -31,7 +31,7 @@ static inline void tlbiel_all(void)
                hash__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL);
 }
 #else
-static inline void tlbiel_all(void) { BUG(); };
+static inline void tlbiel_all(void) { BUG(); }
 #endif
 
 static inline void tlbiel_all_lpid(bool radix)
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 464f8ca..d1635ff 100644
 #ifndef __ASSEMBLY__
 
 struct pt_regs;
-extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
-extern void bad_page_fault(struct pt_regs *, unsigned long, int);
-void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig);
+long do_page_fault(struct pt_regs *);
+long hash__do_page_fault(struct pt_regs *);
+void bad_page_fault(struct pt_regs *, int);
+void __bad_page_fault(struct pt_regs *regs, int sig);
+void do_bad_page_fault_segv(struct pt_regs *regs);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
 extern void _exception_pkey(struct pt_regs *, unsigned long, int);
 extern void die(const char *, struct pt_regs *, long);
+void die_mce(const char *str, struct pt_regs *regs, long err);
 extern bool die_will_crash(void);
 extern void panic_flush_kmsg_start(void);
 extern void panic_flush_kmsg_end(void);
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index 138e46d..f634951 100644
@@ -8,6 +8,12 @@
 #include <asm/cputable.h>
 #include <asm/cpu_has_feature.h>
 
+/*
+ * This flag is used to indicate that the page pointed to by a pte is clean
+ * and does not require cleaning before returning it to the user.
+ */
+#define PG_dcache_clean PG_arch_1
+
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
  * Book3s has no ptesync after setting a pte, so without this ptesync it's
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index ed75d1c..504f7fe 100644
@@ -87,6 +87,17 @@ static notrace inline void account_cpu_user_exit(void)
        acct->starttime_user = tb;
 }
 
+static notrace inline void account_stolen_time(void)
+{
+#ifdef CONFIG_PPC_SPLPAR
+       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+               struct lppaca *lp = local_paca->lppaca_ptr;
+
+               if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
+                       accumulate_stolen_time();
+       }
+#endif
+}
 
 #endif /* __KERNEL__ */
 #else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
@@ -96,5 +107,8 @@ static inline void account_cpu_user_entry(void)
 static inline void account_cpu_user_exit(void)
 {
 }
+static notrace inline void account_stolen_time(void)
+{
+}
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 #endif /* __POWERPC_CPUTIME_H */
diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index ec57daf..86a1473 100644
@@ -50,10 +50,6 @@ bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
                         unsigned long error_code, int brkpt);
-#else
-
-extern void do_break(struct pt_regs *regs, unsigned long address,
-                    unsigned long error_code);
 #endif
 
 #endif /* _ASM_POWERPC_DEBUG_H */
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index aa6a5ef..7604673 100644
@@ -137,7 +137,7 @@ extern unsigned int __start___fw_ftr_fixup, __stop___fw_ftr_fixup;
 #ifdef CONFIG_PPC_PSERIES
 void pseries_probe_fw_features(void);
 #else
-static inline void pseries_probe_fw_features(void) { };
+static inline void pseries_probe_fw_features(void) { }
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 013165e..f18c543 100644
@@ -17,8 +17,6 @@ extern bool hugetlb_disabled;
 
 void hugetlbpage_init_default(void);
 
-void flush_dcache_icache_hugepage(struct page *page);
-
 int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
                           unsigned long len);
 
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 0363734..56a9893 100644
@@ -38,6 +38,8 @@
 #define PACA_IRQ_MUST_HARD_MASK        (PACA_IRQ_EE)
 #endif
 
+#endif /* CONFIG_PPC64 */
+
 /*
  * flags for paca->irq_soft_mask
  */
 #define IRQS_PMI_DISABLED      2
 #define IRQS_ALL_DISABLED      (IRQS_DISABLED | IRQS_PMI_DISABLED)
 
-#endif /* CONFIG_PPC64 */
-
 #ifndef __ASSEMBLY__
 
-extern void replay_system_reset(void);
-extern void replay_soft_interrupts(void);
+static inline void __hard_irq_enable(void)
+{
+       if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x))
+               wrtee(MSR_EE);
+       else if (IS_ENABLED(CONFIG_PPC_8xx))
+               wrtspr(SPRN_EIE);
+       else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+               __mtmsrd(MSR_EE | MSR_RI, 1);
+       else
+               mtmsr(mfmsr() | MSR_EE);
+}
 
-extern void timer_interrupt(struct pt_regs *);
-extern void timer_broadcast_interrupt(void);
-extern void performance_monitor_exception(struct pt_regs *regs);
-extern void WatchdogException(struct pt_regs *regs);
-extern void unknown_exception(struct pt_regs *regs);
+static inline void __hard_irq_disable(void)
+{
+       if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x))
+               wrtee(0);
+       else if (IS_ENABLED(CONFIG_PPC_8xx))
+               wrtspr(SPRN_EID);
+       else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+               __mtmsrd(MSR_RI, 1);
+       else
+               mtmsr(mfmsr() & ~MSR_EE);
+}
+
+static inline void __hard_EE_RI_disable(void)
+{
+       if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x))
+               wrtee(0);
+       else if (IS_ENABLED(CONFIG_PPC_8xx))
+               wrtspr(SPRN_NRI);
+       else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+               __mtmsrd(0, 1);
+       else
+               mtmsr(mfmsr() & ~(MSR_EE | MSR_RI));
+}
+
+static inline void __hard_RI_enable(void)
+{
+       if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x))
+               return;
+
+       if (IS_ENABLED(CONFIG_PPC_8xx))
+               wrtspr(SPRN_EID);
+       else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+               __mtmsrd(MSR_RI, 1);
+       else
+               mtmsr(mfmsr() | MSR_RI);
+}
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
@@ -221,18 +261,6 @@ static inline bool arch_irqs_disabled(void)
 
 #endif /* CONFIG_PPC_BOOK3S */
 
-#ifdef CONFIG_PPC_BOOK3E
-#define __hard_irq_enable()    wrtee(MSR_EE)
-#define __hard_irq_disable()   wrtee(0)
-#define __hard_EE_RI_disable() wrtee(0)
-#define __hard_RI_enable()     do { } while (0)
-#else
-#define __hard_irq_enable()    __mtmsrd(MSR_EE|MSR_RI, 1)
-#define __hard_irq_disable()   __mtmsrd(MSR_RI, 1)
-#define __hard_EE_RI_disable() __mtmsrd(0, 1)
-#define __hard_RI_enable()     __mtmsrd(MSR_RI, 1)
-#endif
-
 #define hard_irq_disable()     do {                                    \
        unsigned long flags;                                            \
        __hard_irq_disable();                                           \
@@ -296,8 +324,17 @@ extern void irq_set_pending_from_srr1(unsigned long srr1);
 
 extern void force_external_irq_replay(void);
 
+static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs, unsigned long val)
+{
+       regs->softe = val;
+}
 #else /* CONFIG_PPC64 */
 
+static inline notrace unsigned long irq_soft_mask_return(void)
+{
+       return 0;
+}
+
 static inline unsigned long arch_local_save_flags(void)
 {
        return mfmsr();
@@ -327,22 +364,12 @@ static inline unsigned long arch_local_irq_save(void)
 
 static inline void arch_local_irq_disable(void)
 {
-       if (IS_ENABLED(CONFIG_BOOKE))
-               wrtee(0);
-       else if (IS_ENABLED(CONFIG_PPC_8xx))
-               wrtspr(SPRN_EID);
-       else
-               mtmsr(mfmsr() & ~MSR_EE);
+       __hard_irq_disable();
 }
 
 static inline void arch_local_irq_enable(void)
 {
-       if (IS_ENABLED(CONFIG_BOOKE))
-               wrtee(MSR_EE);
-       else if (IS_ENABLED(CONFIG_PPC_8xx))
-               wrtspr(SPRN_EIE);
-       else
-               mtmsr(mfmsr() | MSR_EE);
+       __hard_irq_enable();
 }
 
 static inline bool arch_irqs_disabled_flags(unsigned long flags)
@@ -364,6 +391,9 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs)
 
 static inline void may_hard_irq_enable(void) { }
 
+static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs, unsigned long val)
+{
+}
 #endif /* CONFIG_PPC64 */
 
 #define ARCH_IRQ_INIT_FLAGS    IRQ_NOREQUEST
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
new file mode 100644
index 0000000..aedfba2
--- /dev/null
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -0,0 +1,449 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_POWERPC_INTERRUPT_H
+#define _ASM_POWERPC_INTERRUPT_H
+
+#include <linux/context_tracking.h>
+#include <linux/hardirq.h>
+#include <asm/cputime.h>
+#include <asm/ftrace.h>
+#include <asm/kprobes.h>
+#include <asm/runlatch.h>
+
+struct interrupt_state {
+#ifdef CONFIG_PPC_BOOK3E_64
+       enum ctx_state ctx_state;
+#endif
+};
+
+static inline void booke_restore_dbcr0(void)
+{
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+       unsigned long dbcr0 = current->thread.debug.dbcr0;
+
+       if (IS_ENABLED(CONFIG_PPC32) && unlikely(dbcr0 & DBCR0_IDM)) {
+               mtspr(SPRN_DBSR, -1);
+               mtspr(SPRN_DBCR0, global_dbcr0[smp_processor_id()]);
+       }
+#endif
+}
+
+static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrupt_state *state)
+{
+       /*
+        * Book3E reconciles irq soft mask in asm
+        */
+#ifdef CONFIG_PPC_BOOK3S_64
+       if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED)
+               trace_hardirqs_off();
+       local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+       if (user_mode(regs)) {
+               CT_WARN_ON(ct_state() != CONTEXT_USER);
+               user_exit_irqoff();
+
+               account_cpu_user_entry();
+               account_stolen_time();
+       } else {
+               /*
+                * CT_WARN_ON comes here via program_check_exception,
+                * so avoid recursion.
+                */
+               if (TRAP(regs) != 0x700)
+                       CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+       }
+#endif
+
+#ifdef CONFIG_PPC_BOOK3E_64
+       state->ctx_state = exception_enter();
+       if (user_mode(regs))
+               account_cpu_user_entry();
+#endif
+}
+
+/*
+ * Care should be taken to note that interrupt_exit_prepare and
+ * interrupt_async_exit_prepare do not necessarily return immediately to
+ * regs context (e.g., if regs is usermode, we don't necessarily return to
+ * user mode). Other interrupts might be taken between here and return,
+ * context switch / preemption may occur in the exit path after this, or a
+ * signal may be delivered, etc.
+ *
+ * The real interrupt exit code is platform specific, e.g.,
+ * interrupt_exit_user_prepare / interrupt_exit_kernel_prepare for 64s.
+ *
+ * However interrupt_nmi_exit_prepare does return directly to regs, because
+ * NMIs do not do "exit work" or replay soft-masked interrupts.
+ */
+static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt_state *state)
+{
+#ifdef CONFIG_PPC_BOOK3E_64
+       exception_exit(state->ctx_state);
+#endif
+
+       /*
+        * Book3S exits to user via interrupt_exit_user_prepare(), which does
+        * context tracking; that is a cleaner way to handle PREEMPT=y
+        * and avoid context entry/exit in e.g. preempt_schedule_irq(),
+        * which is likely where the core code wants to end up.
+        *
+        * The above comment explains why we can't do the
+        *
+        *     if (user_mode(regs))
+        *         user_exit_irqoff();
+        *
+        * sequence here.
+        */
+}
+
+static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+       if (cpu_has_feature(CPU_FTR_CTRL) &&
+           !test_thread_local_flags(_TLF_RUNLATCH))
+               __ppc64_runlatch_on();
+#endif
+
+       interrupt_enter_prepare(regs, state);
+       irq_enter();
+}
+
+static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct interrupt_state *state)
+{
+       irq_exit();
+       interrupt_exit_prepare(regs, state);
+}
+
+struct interrupt_nmi_state {
+#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
+       u8 irq_soft_mask;
+       u8 irq_happened;
+#endif
+       u8 ftrace_enabled;
+#endif
+};
+
+static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
+{
+#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
+       state->irq_soft_mask = local_paca->irq_soft_mask;
+       state->irq_happened = local_paca->irq_happened;
+
+       /*
+        * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
+        * the right thing, and set IRQ_HARD_DIS. We do not want to reconcile
+        * because that goes through irq tracing which we don't want in NMI.
+        */
+       local_paca->irq_soft_mask = IRQS_ALL_DISABLED;
+       local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+       /* Don't do any per-CPU operations until interrupt state is fixed */
+#endif
+       /* Allow DEC and PMI to be traced when they are soft-NMI */
+       if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260) {
+               state->ftrace_enabled = this_cpu_get_ftrace_enabled();
+               this_cpu_set_ftrace_enabled(0);
+       }
+#endif
+
+       /*
+        * Do not use nmi_enter() for pseries hash guest taking a real-mode
+        * NMI because not everything it touches is within the RMA limit.
+        */
+       if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
+                       !firmware_has_feature(FW_FEATURE_LPAR) ||
+                       radix_enabled() || (mfmsr() & MSR_DR))
+               nmi_enter();
+}
+
+static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
+{
+       if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
+                       !firmware_has_feature(FW_FEATURE_LPAR) ||
+                       radix_enabled() || (mfmsr() & MSR_DR))
+               nmi_exit();
+
+#ifdef CONFIG_PPC64
+       if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260)
+               this_cpu_set_ftrace_enabled(state->ftrace_enabled);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+       /* Check we didn't change the pending interrupt mask. */
+       WARN_ON_ONCE((state->irq_happened | PACA_IRQ_HARD_DIS) != local_paca->irq_happened);
+       local_paca->irq_happened = state->irq_happened;
+       local_paca->irq_soft_mask = state->irq_soft_mask;
+#endif
+#endif
+}
+
+/*
+ * Don't use noinstr here like x86, but rather add NOKPROBE_SYMBOL to each
+ * function definition. The reason for this is the noinstr section is placed
+ * after the main text section, i.e., very far away from the interrupt entry
+ * asm. That creates problems with fitting linker stubs when building large
+ * kernels.
+ */
+#define interrupt_handler __visible noinline notrace __no_kcsan __no_sanitize_address
+
+/**
+ * DECLARE_INTERRUPT_HANDLER_RAW - Declare raw interrupt handler function
+ * @func:      Function name of the entry point
+ * @returns:   Returns a value back to asm caller
+ */
+#define DECLARE_INTERRUPT_HANDLER_RAW(func)                            \
+       __visible long func(struct pt_regs *regs)
+
+/**
+ * DEFINE_INTERRUPT_HANDLER_RAW - Define raw interrupt handler function
+ * @func:      Function name of the entry point
+ * @returns:   Returns a value back to asm caller
+ *
+ * @func is called from ASM entry code.
+ *
+ * This is a plain function which does no tracing, reconciling, etc.
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ *
+ * Raw interrupt handlers must not enable or disable interrupts, or
+ * schedule. Tracing and instrumentation (ftrace, lockdep, etc) would
+ * not be advisable either; although it may be possible in a pinch, the
+ * trace will look odd at least.
+ *
+ * A raw handler may call one of the other interrupt handler functions
+ * to be converted into that interrupt context without these restrictions.
+ *
+ * On PPC64, _RAW handlers may return with fast_interrupt_return.
+ *
+ * Specific handlers may have additional restrictions.
+ */
+#define DEFINE_INTERRUPT_HANDLER_RAW(func)                             \
+static __always_inline long ____##func(struct pt_regs *regs);          \
+                                                                       \
+interrupt_handler long func(struct pt_regs *regs)                      \
+{                                                                      \
+       long ret;                                                       \
+                                                                       \
+       ret = ____##func (regs);                                        \
+                                                                       \
+       return ret;                                                     \
+}                                                                      \
+NOKPROBE_SYMBOL(func);                                                 \
+                                                                       \
+static __always_inline long ____##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_INTERRUPT_HANDLER - Declare synchronous interrupt handler function
+ * @func:      Function name of the entry point
+ */
+#define DECLARE_INTERRUPT_HANDLER(func)                                        \
+       __visible void func(struct pt_regs *regs)
+
+/**
+ * DEFINE_INTERRUPT_HANDLER - Define synchronous interrupt handler function
+ * @func:      Function name of the entry point
+ *
+ * @func is called from ASM entry code.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ */
+#define DEFINE_INTERRUPT_HANDLER(func)                                 \
+static __always_inline void ____##func(struct pt_regs *regs);          \
+                                                                       \
+interrupt_handler void func(struct pt_regs *regs)                      \
+{                                                                      \
+       struct interrupt_state state;                                   \
+                                                                       \
+       interrupt_enter_prepare(regs, &state);                          \
+                                                                       \
+       ____##func (regs);                                              \
+                                                                       \
+       interrupt_exit_prepare(regs, &state);                           \
+}                                                                      \
+NOKPROBE_SYMBOL(func);                                                 \
+                                                                       \
+static __always_inline void ____##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_INTERRUPT_HANDLER_RET - Declare synchronous interrupt handler function
+ * @func:      Function name of the entry point
+ * @returns:   Returns a value back to asm caller
+ */
+#define DECLARE_INTERRUPT_HANDLER_RET(func)                            \
+       __visible long func(struct pt_regs *regs)
+
+/**
+ * DEFINE_INTERRUPT_HANDLER_RET - Define synchronous interrupt handler function
+ * @func:      Function name of the entry point
+ * @returns:   Returns a value back to asm caller
+ *
+ * @func is called from ASM entry code.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ */
+#define DEFINE_INTERRUPT_HANDLER_RET(func)                             \
+static __always_inline long ____##func(struct pt_regs *regs);          \
+                                                                       \
+interrupt_handler long func(struct pt_regs *regs)                      \
+{                                                                      \
+       struct interrupt_state state;                                   \
+       long ret;                                                       \
+                                                                       \
+       interrupt_enter_prepare(regs, &state);                          \
+                                                                       \
+       ret = ____##func (regs);                                        \
+                                                                       \
+       interrupt_exit_prepare(regs, &state);                           \
+                                                                       \
+       return ret;                                                     \
+}                                                                      \
+NOKPROBE_SYMBOL(func);                                                 \
+                                                                       \
+static __always_inline long ____##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_INTERRUPT_HANDLER_ASYNC - Declare asynchronous interrupt handler function
+ * @func:      Function name of the entry point
+ */
+#define DECLARE_INTERRUPT_HANDLER_ASYNC(func)                          \
+       __visible void func(struct pt_regs *regs)
+
+/**
+ * DEFINE_INTERRUPT_HANDLER_ASYNC - Define asynchronous interrupt handler function
+ * @func:      Function name of the entry point
+ *
+ * @func is called from ASM entry code.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ */
+#define DEFINE_INTERRUPT_HANDLER_ASYNC(func)                           \
+static __always_inline void ____##func(struct pt_regs *regs);          \
+                                                                       \
+interrupt_handler void func(struct pt_regs *regs)                      \
+{                                                                      \
+       struct interrupt_state state;                                   \
+                                                                       \
+       interrupt_async_enter_prepare(regs, &state);                    \
+                                                                       \
+       ____##func (regs);                                              \
+                                                                       \
+       interrupt_async_exit_prepare(regs, &state);                     \
+}                                                                      \
+NOKPROBE_SYMBOL(func);                                                 \
+                                                                       \
+static __always_inline void ____##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_INTERRUPT_HANDLER_NMI - Declare NMI interrupt handler function
+ * @func:      Function name of the entry point
+ * @returns:   Returns a value back to asm caller
+ */
+#define DECLARE_INTERRUPT_HANDLER_NMI(func)                            \
+       __visible long func(struct pt_regs *regs)
+
+/**
+ * DEFINE_INTERRUPT_HANDLER_NMI - Define NMI interrupt handler function
+ * @func:      Function name of the entry point
+ * @returns:   Returns a value back to asm caller
+ *
+ * @func is called from ASM entry code.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ */
+#define DEFINE_INTERRUPT_HANDLER_NMI(func)                             \
+static __always_inline long ____##func(struct pt_regs *regs);          \
+                                                                       \
+interrupt_handler long func(struct pt_regs *regs)                      \
+{                                                                      \
+       struct interrupt_nmi_state state;                               \
+       long ret;                                                       \
+                                                                       \
+       interrupt_nmi_enter_prepare(regs, &state);                      \
+                                                                       \
+       ret = ____##func (regs);                                        \
+                                                                       \
+       interrupt_nmi_exit_prepare(regs, &state);                       \
+                                                                       \
+       return ret;                                                     \
+}                                                                      \
+NOKPROBE_SYMBOL(func);                                                 \
+                                                                       \
+static __always_inline long ____##func(struct pt_regs *regs)
+
+
+/* Interrupt handlers */
+/* kernel/traps.c */
+DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
+#ifdef CONFIG_PPC_BOOK3S_64
+DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
+#else
+DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
+#endif
+DECLARE_INTERRUPT_HANDLER(SMIException);
+DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
+DECLARE_INTERRUPT_HANDLER(unknown_exception);
+DECLARE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception);
+DECLARE_INTERRUPT_HANDLER(instruction_breakpoint_exception);
+DECLARE_INTERRUPT_HANDLER(RunModeException);
+DECLARE_INTERRUPT_HANDLER(single_step_exception);
+DECLARE_INTERRUPT_HANDLER(program_check_exception);
+DECLARE_INTERRUPT_HANDLER(emulation_assist_interrupt);
+DECLARE_INTERRUPT_HANDLER(alignment_exception);
+DECLARE_INTERRUPT_HANDLER(StackOverflow);
+DECLARE_INTERRUPT_HANDLER(stack_overflow_exception);
+DECLARE_INTERRUPT_HANDLER(kernel_fp_unavailable_exception);
+DECLARE_INTERRUPT_HANDLER(altivec_unavailable_exception);
+DECLARE_INTERRUPT_HANDLER(vsx_unavailable_exception);
+DECLARE_INTERRUPT_HANDLER(facility_unavailable_exception);
+DECLARE_INTERRUPT_HANDLER(fp_unavailable_tm);
+DECLARE_INTERRUPT_HANDLER(altivec_unavailable_tm);
+DECLARE_INTERRUPT_HANDLER(vsx_unavailable_tm);
+DECLARE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi);
+DECLARE_INTERRUPT_HANDLER_ASYNC(performance_monitor_exception_async);
+DECLARE_INTERRUPT_HANDLER_RAW(performance_monitor_exception);
+DECLARE_INTERRUPT_HANDLER(DebugException);
+DECLARE_INTERRUPT_HANDLER(altivec_assist_exception);
+DECLARE_INTERRUPT_HANDLER(CacheLockingException);
+DECLARE_INTERRUPT_HANDLER(SPEFloatingPointException);
+DECLARE_INTERRUPT_HANDLER(SPEFloatingPointRoundException);
+DECLARE_INTERRUPT_HANDLER(unrecoverable_exception);
+DECLARE_INTERRUPT_HANDLER(WatchdogException);
+DECLARE_INTERRUPT_HANDLER(kernel_bad_stack);
+
+/* slb.c */
+DECLARE_INTERRUPT_HANDLER_RAW(do_slb_fault);
+DECLARE_INTERRUPT_HANDLER(do_bad_slb_fault);
+
+/* hash_utils.c */
+DECLARE_INTERRUPT_HANDLER_RAW(do_hash_fault);
+
+/* fault.c */
+DECLARE_INTERRUPT_HANDLER_RET(do_page_fault);
+DECLARE_INTERRUPT_HANDLER(do_bad_page_fault_segv);
+
+/* process.c */
+DECLARE_INTERRUPT_HANDLER(do_break);
+
+/* time.c */
+DECLARE_INTERRUPT_HANDLER_ASYNC(timer_interrupt);
+
+/* mce.c */
+DECLARE_INTERRUPT_HANDLER_NMI(machine_check_early);
+DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode);
+
+DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException);
+
+void replay_system_reset(void);
+void replay_soft_interrupts(void);
+
+static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs)
+{
+       if (!arch_irq_disabled_regs(regs))
+               local_irq_enable();
+}
+
+#endif /* _ASM_POWERPC_INTERRUPT_H */
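
Usage note for the macros above: each DECLARE_* in this header pairs
with a DEFINE_* at the handler's definition site, with the body
appended in curly brackets. A schematic for the raw SLB fault handler
(the body shown is abbreviated, not the real implementation):

    /* arch/powerpc/mm/book3s64/slb.c -- schematic */
    DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
    {
            unsigned long ea = regs->dar;

            /* fast SLB insertion; no tracing or reconciling in _RAW */
            return 0;       /* value is returned to the asm caller */
    }
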
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 55d6ede..9ab344d 100644
@@ -136,6 +136,7 @@ int load_crashdump_segments_ppc64(struct kimage *image,
 int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
                          const void *fdt, unsigned long kernel_load_addr,
                          unsigned long fdt_load_addr);
+unsigned int kexec_fdt_totalsize_ppc64(struct kimage *image);
 int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
                        unsigned long initrd_load_addr,
                        unsigned long initrd_len, const char *cmdline);
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index bf221a2..7ec21af 100644
@@ -91,6 +91,7 @@ static __always_inline void setup_kup(void)
 
 static inline void allow_read_from_user(const void __user *from, unsigned long size)
 {
+       barrier_nospec();
        allow_user_access(NULL, from, size, KUAP_READ);
 }
 
@@ -102,6 +103,7 @@ static inline void allow_write_to_user(void __user *to, unsigned long size)
 static inline void allow_read_write_user(void __user *to, const void __user *from,
                                         unsigned long size)
 {
+       barrier_nospec();
        allow_user_access(to, from, size, KUAP_READ_WRITE);
 }
 
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index d32ec9a..2f5f919 100644
@@ -277,6 +277,13 @@ extern int kvmppc_hcall_impl_hv_realmode(unsigned long cmd);
 extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu);
 
+long kvmppc_read_intr(void);
+void kvmppc_bad_interrupt(struct pt_regs *regs);
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip);
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip);
+void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr);
+void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags);
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index df4bda8..8aacd76 100644
@@ -629,9 +629,9 @@ extern int h_ipi_redirect;
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
                                struct kvm *kvm)
        { return NULL; }
-static inline void kvmppc_alloc_host_rm_ops(void) {};
-static inline void kvmppc_free_host_rm_ops(void) {};
-static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline void kvmppc_alloc_host_rm_ops(void) {}
+static inline void kvmppc_free_host_rm_ops(void) {}
+static inline void kvmppc_free_pimap(struct kvm *kvm) {}
 static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
        { return 0; }
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
@@ -883,9 +883,9 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn)
 
        /* Clear i-cache for new pages */
        page = pfn_to_page(pfn);
-       if (!test_bit(PG_arch_1, &page->flags)) {
+       if (!test_bit(PG_dcache_clean, &page->flags)) {
                flush_dcache_icache_page(page);
-               set_bit(PG_arch_1, &page->flags);
+               set_bit(PG_dcache_clean, &page->flags);
        }
 }
 
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index cf6ebbc..764f273 100644
@@ -59,6 +59,9 @@ struct machdep_calls {
        int             (*pcibios_root_bridge_prepare)(struct pci_host_bridge
                                *bridge);
 
+       /* finds all the pci_controllers present at boot */
+       void            (*discover_phbs)(void);
+
        /* To setup PHBs when using automatic OF platform driver for PCI */
        int             (*pci_setup_phb)(struct pci_controller *host);
 
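The discover_phbs hook added above is the mechanism behind the "PCI
probing" rework in the pull message: each platform describes how to
find its host bridges, and generic code invokes the hook later in boot
when more infrastructure is available. A rough sketch of how a platform
wires it up (the function name and machine are hypothetical;
define_machine() is the existing powerpc convention):

    static void __init example_discover_phbs(void)
    {
            /* walk the device tree and register each PHB found */
    }

    define_machine(example) {
            .name           = "Example Platform",
            .discover_phbs  = example_discover_phbs,
            /* ... other machdep callbacks ... */
    };
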
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e6c27ae..331d944 100644
@@ -204,7 +204,18 @@ struct mce_error_info {
        bool                    ignore_event;
 };
 
-#define MAX_MC_EVT     100
+#define MAX_MC_EVT     10
+
+struct mce_info {
+       int mce_nest_count;
+       struct machine_check_event mce_event[MAX_MC_EVT];
+       /* Queue for delayed MCE events. */
+       int mce_queue_count;
+       struct machine_check_event mce_event_queue[MAX_MC_EVT];
+       /* Queue for delayed MCE UE events. */
+       int mce_ue_count;
+       struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+};
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE      true
@@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs);
 long __machine_check_early_realmode_p9(struct pt_regs *regs);
 long __machine_check_early_realmode_p10(struct pt_regs *regs);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_init(void);
+#else
+static inline void mce_init(void) { };
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index d582183..652ce85 100644
@@ -282,9 +282,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 }
 
 #define pkey_mm_init(mm)
-#define thread_pkey_regs_save(thread)
-#define thread_pkey_regs_restore(new_thread, old_thread)
-#define thread_pkey_regs_init(thread)
 #define arch_dup_pkeys(oldmm, mm)
 
 static inline u64 pte_to_hpte_pkey_bits(u64 pteflags, unsigned long flags)
diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
index 84b4cfe..160abcb 100644
@@ -4,6 +4,7 @@
 
 #ifdef CONFIG_PPC_WATCHDOG
 extern void arch_touch_nmi_watchdog(void);
+long soft_nmi_interrupt(struct pt_regs *regs);
 #else
 static inline void arch_touch_nmi_watchdog(void) {}
 #endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29..ec18ac8 100644
@@ -29,6 +29,7 @@
 #include <asm/hmi.h>
 #include <asm/cpuidle.h>
 #include <asm/atomic.h>
+#include <asm/mce.h>
 
 #include <asm-generic/mmiowb_types.h>
 
@@ -108,8 +109,7 @@ struct paca_struct {
         */
        /* used for most interrupts/exceptions */
        u64 exgen[EX_SIZE] __attribute__((aligned(0x80)));
-       u64 exslb[EX_SIZE];     /* used for SLB/segment table misses
-                                * on the linear mapping */
+
        /* SLB related definitions */
        u16 vmalloc_sllp;
        u8 slb_cache_ptr;
@@ -273,6 +273,9 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
        struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       struct mce_info *mce_info;
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } ____cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
@@ -285,9 +288,9 @@ extern void free_unused_pacas(void);
 
 #else /* CONFIG_PPC64 */
 
-static inline void allocate_paca_ptrs(void) { };
-static inline void allocate_paca(int cpu) { };
-static inline void free_unused_pacas(void) { };
+static inline void allocate_paca_ptrs(void) { }
+static inline void allocate_paca(int cpu) { }
+static inline void free_unused_pacas(void) { }
 
 #endif /* CONFIG_PPC64 */
 
diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index edc08f0..5d1726b 100644
@@ -10,6 +10,7 @@
 #endif
 
 #ifdef CONFIG_PPC_SPLPAR
+#include <linux/smp.h>
 #include <asm/kvm_guest.h>
 #include <asm/cputhreads.h>
 
diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index daec64d..164e910 100644
@@ -14,6 +14,7 @@
 #include <asm/perf_event_server.h>
 #else
 static inline bool is_sier_available(void) { return false; }
+static inline unsigned long get_pmcs_ext_regs(int idx) { return 0; }
 #endif
 
 #ifdef CONFIG_FSL_EMB_PERF_EVENT
@@ -40,6 +41,7 @@ static inline bool is_sier_available(void) { return false; }
 
 /* To support perf_regs sier update */
 extern bool is_sier_available(void);
+extern unsigned long get_pmcs_ext_regs(int idx);
 /* To define perf extended regs mask value */
 extern u64 PERF_REG_EXTENDED_MASK;
 #define PERF_REG_EXTENDED_MASK PERF_REG_EXTENDED_MASK
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 3b7baba..00e7e67 100644
@@ -36,9 +36,9 @@ struct power_pmu {
        unsigned long   test_adder;
        int             (*compute_mmcr)(u64 events[], int n_ev,
                                unsigned int hwc[], struct mmcr_regs *mmcr,
-                               struct perf_event *pevents[]);
+                               struct perf_event *pevents[], u32 flags);
        int             (*get_constraint)(u64 event_id, unsigned long *mskp,
-                               unsigned long *valp);
+                               unsigned long *valp, u64 event_config1);
        int             (*get_alternatives)(u64 event_id, unsigned int flags,
                                u64 alt[]);
        void            (*get_mem_data_src)(union perf_mem_data_src *dsrc,
@@ -83,6 +83,7 @@ struct power_pmu {
 #define PPMU_NO_SIAR           0x00000100 /* Do not use SIAR */
 #define PPMU_ARCH_31           0x00000200 /* Has MMCR3, SIER2 and SIER3 */
 #define PPMU_P10_DD1           0x00000400 /* Is power10 DD1 processor version */
+#define PPMU_HAS_ATTR_CONFIG1  0x00000800 /* Using config1 attribute */
 
 /*
  * Values for flags to get_alternatives()
index f7613f4..4eed821 100644
@@ -162,6 +162,9 @@ static inline bool is_ioremap_addr(const void *x)
 
        return addr >= IOREMAP_BASE && addr < IOREMAP_END;
 }
+
+struct seq_file;
+void arch_report_meminfo(struct seq_file *m);
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
index a795104..59a2c7d 100644
@@ -169,10 +169,4 @@ static inline bool arch_pkeys_enabled(void)
 }
 
 extern void pkey_mm_init(struct mm_struct *mm);
-extern bool arch_supports_pkeys(int cap);
-extern unsigned int arch_usable_pkeys(void);
-extern void thread_pkey_regs_save(struct thread_struct *thread);
-extern void thread_pkey_regs_restore(struct thread_struct *new_thread,
-                                    struct thread_struct *old_thread);
-extern void thread_pkey_regs_init(struct thread_struct *thread);
 #endif /*_ASM_POWERPC_KEYS_H */
index 7f4be5a..2b9edbf 100644
 
 extern unsigned long isa_io_base;
 
-extern void pci_setup_phb_io(struct pci_controller *hose, int primary);
-extern void pci_setup_phb_io_dynamic(struct pci_controller *hose, int primary);
-
-
 extern struct list_head hose_list;
 
 extern struct pci_dev *isa_bridge_pcidev;      /* may be NULL if no ISA bus */
@@ -32,9 +28,6 @@ struct pci_dn;
 void *pci_traverse_device_nodes(struct device_node *start,
                                void *(*fn)(struct device_node *, void *),
                                void *data);
-void *traverse_pci_dn(struct pci_dn *root,
-                     void *(*fn)(struct pci_dn *, void *),
-                     void *data);
 extern void pci_devs_phb_init_dynamic(struct pci_controller *phb);
 
 /* From rtas_pci.h */
index cc1bca5..3dceb64 100644
@@ -25,7 +25,6 @@
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 #define ACCOUNT_CPU_USER_ENTRY(ptr, ra, rb)
 #define ACCOUNT_CPU_USER_EXIT(ptr, ra, rb)
-#define ACCOUNT_STOLEN_TIME
 #else
 #define ACCOUNT_CPU_USER_ENTRY(ptr, ra, rb)                            \
        MFTB(ra);                       /* get timebase */              \
        PPC_LL  ra, ACCOUNT_SYSTEM_TIME(ptr);                           \
        add     ra,ra,rb;               /* add on to system time */     \
        PPC_STL ra, ACCOUNT_SYSTEM_TIME(ptr)
-
-#ifdef CONFIG_PPC_SPLPAR
-#define ACCOUNT_STOLEN_TIME                                            \
-BEGIN_FW_FTR_SECTION;                                                  \
-       beq     33f;                                                    \
-       /* from user - see if there are any DTL entries to process */   \
-       ld      r10,PACALPPACAPTR(r13); /* get ptr to VPA */            \
-       ld      r11,PACA_DTL_RIDX(r13); /* get log read index */        \
-       addi    r10,r10,LPPACA_DTLIDX;                                  \
-       LDX_BE  r10,0,r10;              /* get log write index */       \
-       cmpd    cr1,r11,r10;                                            \
-       beq+    cr1,33f;                                                \
-       bl      accumulate_stolen_time;                         \
-       ld      r12,_MSR(r1);                                           \
-       andi.   r10,r12,MSR_PR;         /* Restore cr0 (coming from user) */ \
-33:                                                                    \
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
-
-#else  /* CONFIG_PPC_SPLPAR */
-#define ACCOUNT_STOLEN_TIME
-
-#endif /* CONFIG_PPC_SPLPAR */
-
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 /*
index 58f9dc0..975ba26 100644
@@ -70,6 +70,9 @@ struct pt_regs
 };
 #endif
 
+
+#define STACK_FRAME_WITH_PT_REGS (STACK_FRAME_OVERHEAD + sizeof(struct pt_regs))
+
 #ifdef __powerpc64__
 
 /*
@@ -229,6 +232,11 @@ static inline bool trap_is_scv(struct pt_regs *regs)
        return (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && TRAP(regs) == 0x3000);
 }
 
+static inline bool trap_is_unsupported_scv(struct pt_regs *regs)
+{
+       return IS_ENABLED(CONFIG_PPC_BOOK3S_64) && TRAP(regs) == 0x7ff0;
+}
+
 static inline bool trap_is_syscall(struct pt_regs *regs)
 {
        return (trap_is_scv(regs) || TRAP(regs) == 0xc00);
index e40a921..da103e9 100644
 #define mtmsr(v)       asm volatile("mtmsr %0" : \
                                     : "r" ((unsigned long)(v)) \
                                     : "memory")
+#define __mtmsrd(v, l) BUILD_BUG()
 #define __MTMSR                "mtmsr"
 #endif
 
@@ -1413,13 +1414,24 @@ static inline void msr_check_and_clear(unsigned long bits)
 }
 
 #ifdef CONFIG_PPC32
-#define mfsrin(v)      ({unsigned int rval; \
-                       asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \
-                                       rval;})
+static inline u32 mfsr(u32 idx)
+{
+       u32 val;
+
+       if (__builtin_constant_p(idx))
+               asm volatile("mfsr %0, %1" : "=r" (val): "i" (idx >> 28));
+       else
+               asm volatile("mfsrin %0, %1" : "=r" (val): "r" (idx));
 
-static inline void mtsrin(u32 val, u32 idx)
+       return val;
+}
+
+static inline void mtsr(u32 val, u32 idx)
 {
-       asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx));
+       if (__builtin_constant_p(idx))
+               asm volatile("mtsr %1, %0" : : "r" (val), "i" (idx >> 28));
+       else
+               asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx));
 }
 #endif
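
The new helpers select the instruction form at compile time: a constant effective address compiles to the immediate-form mfsr/mtsr (the segment number is the top four address bits, hence the shift by 28), while a variable address uses the indirect mfsrin/mtsrin form. A minimal usage sketch (hypothetical callers, not part of the patch):

	/* Hypothetical illustration only; these callers are not in the patch. */
	static u32 sr_for(u32 ea)
	{
		return mfsr(ea);		/* ea not constant: emits mfsrin */
	}

	static u32 sr_for_kernel_base(void)
	{
		return mfsr(0xc0000000);	/* constant: emits "mfsr %0, 12" */
	}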
 
index 262782f..17b8dcd 100644
 #define mttmr(rn, v)   asm volatile(MTTMR(rn, %0) : \
                                     : "r" ((unsigned long)(v)) \
                                     : "memory")
+
+extern unsigned long global_dbcr0[];
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_POWERPC_REG_BOOKE_H__ */
index 332e100..658448c 100644
@@ -369,7 +369,7 @@ void rtas_initialize(void);
 #else
 static inline int page_is_rtas_user_buf(unsigned long pfn) { return 0;}
 static inline void pSeries_coalesce_init(void) { }
-static inline void rtas_initialize(void) { };
+static inline void rtas_initialize(void) { }
 #endif
 
 extern int call_rtas(const char *, int, int, unsigned long *, ...);
index a466749..e89bfeb 100644
@@ -58,7 +58,7 @@ void do_rfi_flush_fixups(enum l1d_flush_type types);
 #ifdef CONFIG_PPC_BARRIER_NOSPEC
 void setup_barrier_nospec(void);
 #else
-static inline void setup_barrier_nospec(void) { };
+static inline void setup_barrier_nospec(void) { }
 #endif
 void do_uaccess_flush_fixups(enum l1d_flush_type types);
 void do_entry_flush_fixups(enum l1d_flush_type types);
@@ -68,13 +68,13 @@ extern bool barrier_nospec_enabled;
 #ifdef CONFIG_PPC_BARRIER_NOSPEC
 void do_barrier_nospec_fixups_range(bool enable, void *start, void *end);
 #else
-static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { };
+static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { }
 #endif
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
 void setup_spectre_v2(void);
 #else
-static inline void setup_spectre_v2(void) {};
+static inline void setup_spectre_v2(void) {}
 #endif
 void do_btb_flush_fixups(void);
 
index 9c3c305..5b862de 100644
@@ -90,8 +90,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 void splpar_spin_yield(arch_spinlock_t *lock);
 void splpar_rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
-static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
-static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
+static inline void splpar_spin_yield(arch_spinlock_t *lock) {}
+static inline void splpar_rw_yield(arch_rwlock_t *lock) {}
 #endif
 
 static inline void spin_yield(arch_spinlock_t *lock)
index c4e2d53..7a13bc2 100644
@@ -236,7 +236,7 @@ static inline void set_hard_smp_processor_id(int cpu, int phys)
 #if defined(CONFIG_PPC64) && (defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE))
 extern void smp_release_cpus(void);
 #else
-static inline void smp_release_cpus(void) { };
+static inline void smp_release_cpus(void) { }
 #endif
 
 extern int smt_enabled_at_boot;
index 3d8a47a..386d576 100644
@@ -94,7 +94,6 @@ void arch_setup_new_exec(void);
 #define TIF_PATCH_PENDING      6       /* pending live patching update */
 #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
 #define TIF_SINGLESTEP         8       /* singlestepping active */
-#define TIF_NOHZ               9       /* in adaptive nohz mode */
 #define TIF_SECCOMP            10      /* secure computing */
 #define TIF_RESTOREALL         11      /* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR            12      /* Force successful syscall return */
@@ -128,11 +127,10 @@ void arch_setup_new_exec(void);
 #define _TIF_UPROBE            (1<<TIF_UPROBE)
 #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
-#define _TIF_NOHZ              (1<<TIF_NOHZ)
 #define _TIF_SYSCALL_EMU       (1<<TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
                                 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
-                                _TIF_NOHZ | _TIF_SYSCALL_EMU)
+                                _TIF_SYSCALL_EMU)
 
 #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
                                 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
index 8f789b5..8dd3cdb 100644
@@ -102,6 +102,8 @@ DECLARE_PER_CPU(u64, decrementers_next_tb);
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
+void timer_broadcast_interrupt(void);
+
 /* SPLPAR */
 void accumulate_stolen_time(void);
 
index 501c9a7..78e2a39 100644
@@ -52,8 +52,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size)
        __get_user_nocheck((x), (ptr), sizeof(*(ptr)), true)
 #define __put_user(x, ptr) \
        __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
-#define __put_user_goto(x, ptr, label) \
-       __put_user_nocheck_goto((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)
 
 #define __get_user_allowed(x, ptr) \
        __get_user_nocheck((x), (ptr), sizeof(*(ptr)), false)
@@ -110,22 +108,18 @@ static inline bool __access_ok(unsigned long addr, unsigned long size)
 
 extern long __put_user_bad(void);
 
-#define __put_user_size_allowed(x, ptr, size, retval)          \
+#define __put_user_size(x, ptr, size, retval)                  \
 do {                                                           \
        __label__ __pu_failed;                                  \
                                                                \
        retval = 0;                                             \
+       allow_write_to_user(ptr, size);                         \
        __put_user_size_goto(x, ptr, size, __pu_failed);        \
+       prevent_write_to_user(ptr, size);                       \
        break;                                                  \
                                                                \
 __pu_failed:                                                   \
        retval = -EFAULT;                                       \
-} while (0)
-
-#define __put_user_size(x, ptr, size, retval)                  \
-do {                                                           \
-       allow_write_to_user(ptr, size);                         \
-       __put_user_size_allowed(x, ptr, size, retval);          \
        prevent_write_to_user(ptr, size);                       \
 } while (0)
 
@@ -213,11 +207,9 @@ do {                                                               \
        }                                                       \
 } while (0)
 
-#define __put_user_nocheck_goto(x, ptr, size, label)           \
+#define __unsafe_put_user_goto(x, ptr, size, label)            \
 do {                                                           \
        __typeof__(*(ptr)) __user *__pu_addr = (ptr);           \
-       if (!is_kernel_addr((unsigned long)__pu_addr))          \
-               might_fault();                                  \
        __chk_user_ptr(ptr);                                    \
        __put_user_size_goto((x), __pu_addr, (size), label);    \
 } while (0)
@@ -313,9 +305,8 @@ do {                                                                \
        __typeof__(size) __gu_size = (size);                    \
                                                                \
        __chk_user_ptr(__gu_addr);                              \
-       if (!is_kernel_addr((unsigned long)__gu_addr))          \
+       if (do_allow && !is_kernel_addr((unsigned long)__gu_addr)) \
                might_fault();                                  \
-       barrier_nospec();                                       \
        if (do_allow)                                                           \
                __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err);      \
        else                                                                    \
@@ -333,10 +324,8 @@ do {                                                               \
        __typeof__(size) __gu_size = (size);                            \
                                                                        \
        might_fault();                                                  \
-       if (access_ok(__gu_addr, __gu_size)) {                          \
-               barrier_nospec();                                       \
+       if (access_ok(__gu_addr, __gu_size))                            \
                __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \
-       }                                                               \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                             \
                                                                        \
        __gu_err;                                                       \
@@ -350,7 +339,6 @@ do {                                                                \
        __typeof__(size) __gu_size = (size);                    \
                                                                \
        __chk_user_ptr(__gu_addr);                              \
-       barrier_nospec();                                       \
        __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                     \
                                                                \
@@ -395,7 +383,6 @@ raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
 {
        unsigned long ret;
 
-       barrier_nospec();
        allow_read_write_user(to, from, n);
        ret = __copy_tofrom_user(to, from, n);
        prevent_read_write_user(to, from, n);
@@ -407,72 +394,20 @@ static inline unsigned long raw_copy_from_user(void *to,
                const void __user *from, unsigned long n)
 {
        unsigned long ret;
-       if (__builtin_constant_p(n) && (n <= 8)) {
-               ret = 1;
-
-               switch (n) {
-               case 1:
-                       barrier_nospec();
-                       __get_user_size(*(u8 *)to, from, 1, ret);
-                       break;
-               case 2:
-                       barrier_nospec();
-                       __get_user_size(*(u16 *)to, from, 2, ret);
-                       break;
-               case 4:
-                       barrier_nospec();
-                       __get_user_size(*(u32 *)to, from, 4, ret);
-                       break;
-               case 8:
-                       barrier_nospec();
-                       __get_user_size(*(u64 *)to, from, 8, ret);
-                       break;
-               }
-               if (ret == 0)
-                       return 0;
-       }
 
-       barrier_nospec();
        allow_read_from_user(from, n);
        ret = __copy_tofrom_user((__force void __user *)to, from, n);
        prevent_read_from_user(from, n);
        return ret;
 }
 
-static inline unsigned long
-raw_copy_to_user_allowed(void __user *to, const void *from, unsigned long n)
-{
-       if (__builtin_constant_p(n) && (n <= 8)) {
-               unsigned long ret = 1;
-
-               switch (n) {
-               case 1:
-                       __put_user_size_allowed(*(u8 *)from, (u8 __user *)to, 1, ret);
-                       break;
-               case 2:
-                       __put_user_size_allowed(*(u16 *)from, (u16 __user *)to, 2, ret);
-                       break;
-               case 4:
-                       __put_user_size_allowed(*(u32 *)from, (u32 __user *)to, 4, ret);
-                       break;
-               case 8:
-                       __put_user_size_allowed(*(u64 *)from, (u64 __user *)to, 8, ret);
-                       break;
-               }
-               if (ret == 0)
-                       return 0;
-       }
-
-       return __copy_tofrom_user(to, (__force const void __user *)from, n);
-}
-
 static inline unsigned long
 raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        unsigned long ret;
 
        allow_write_to_user(to, n);
-       ret = raw_copy_to_user_allowed(to, from, n);
+       ret = __copy_tofrom_user(to, (__force const void __user *)from, n);
        prevent_write_to_user(to, n);
        return ret;
 }
@@ -508,6 +443,9 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t
 {
        if (unlikely(!access_ok(ptr, len)))
                return false;
+
+       might_fault();
+
        allow_read_write_user((void __user *)ptr, ptr, len);
        return true;
 }
@@ -521,6 +459,9 @@ user_read_access_begin(const void __user *ptr, size_t len)
 {
        if (unlikely(!access_ok(ptr, len)))
                return false;
+
+       might_fault();
+
        allow_read_from_user(ptr, len);
        return true;
 }
@@ -532,6 +473,9 @@ user_write_access_begin(const void __user *ptr, size_t len)
 {
        if (unlikely(!access_ok(ptr, len)))
                return false;
+
+       might_fault();
+
        allow_write_to_user((void __user *)ptr, len);
        return true;
 }
@@ -540,7 +484,8 @@ user_write_access_begin(const void __user *ptr, size_t len)
 
 #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
 #define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e)
-#define unsafe_put_user(x, p, e) __put_user_goto(x, p, e)
+#define unsafe_put_user(x, p, e) \
+       __unsafe_put_user_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e)
 
 #define unsafe_copy_to_user(d, s, l, e) \
 do {                                                                   \
@@ -550,17 +495,17 @@ do {                                                                      \
        int _i;                                                         \
                                                                        \
        for (_i = 0; _i < (_len & ~(sizeof(long) - 1)); _i += sizeof(long))             \
-               __put_user_goto(*(long*)(_src + _i), (long __user *)(_dst + _i), e);\
+               unsafe_put_user(*(long*)(_src + _i), (long __user *)(_dst + _i), e); \
        if (IS_ENABLED(CONFIG_PPC64) && (_len & 4)) {                   \
-               __put_user_goto(*(u32*)(_src + _i), (u32 __user *)(_dst + _i), e);      \
+               unsafe_put_user(*(u32*)(_src + _i), (u32 __user *)(_dst + _i), e); \
                _i += 4;                                                \
        }                                                               \
        if (_len & 2) {                                                 \
-               __put_user_goto(*(u16*)(_src + _i), (u16 __user *)(_dst + _i), e);      \
+               unsafe_put_user(*(u16*)(_src + _i), (u16 __user *)(_dst + _i), e); \
                _i += 2;                                                \
        }                                                               \
        if (_len & 1) \
-               __put_user_goto(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e);\
+               unsafe_put_user(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e); \
 } while (0)
 
 #define HAVE_GET_KERNEL_NOFAULT
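
The unsafe accessors above are intended to run inside a user_*_access_begin()/user_*_access_end() window, which now also performs the might_fault() check dropped from the inner macros. A minimal caller sketch (hypothetical function, following the generic unsafe-uaccess pattern):

	/* Hypothetical example, not from the patch: two stores under one
	 * open/close of the user-access window. */
	static int put_two_words(u32 __user *p, u32 a, u32 b)
	{
		if (!user_write_access_begin(p, 2 * sizeof(u32)))
			return -EFAULT;
		unsafe_put_user(a, &p[0], efault);	/* branches to efault on fault */
		unsafe_put_user(b, &p[1], efault);
		user_write_access_end();
		return 0;

	efault:
		user_write_access_end();
		return -EFAULT;
	}
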
index 881f655..891c9d5 100644
 #define mttbl(v)       asm volatile("mttbl %0":: "r"(v))
 #define mttbu(v)       asm volatile("mttbu %0":: "r"(v))
 
-/* For compatibility, get_tbl() is defined as get_tb() on ppc64 */
-static inline unsigned long get_tbl(void)
-{
-       return mftb();
-}
-
 static __always_inline u64 get_tb(void)
 {
        unsigned int tbhi, tblo, tbhi2;
index 454a7fc..68bfb23 100644
@@ -17,8 +17,8 @@ struct pt_regs;
 extern int xmon(struct pt_regs *excp);
 extern irqreturn_t xmon_irq(int, void *);
 #else
-static inline void xmon_setup(void) { };
-static inline void xmon_register_spus(struct list_head *list) { };
+static inline void xmon_setup(void) { }
+static inline void xmon_register_spus(struct list_head *list) { }
 #endif
 
 #if defined(CONFIG_XMON) && defined(CONFIG_SMP)
index bdf5f10..578b3ee 100644
@@ -55,17 +55,33 @@ enum perf_event_powerpc_regs {
        PERF_REG_POWERPC_MMCR3,
        PERF_REG_POWERPC_SIER2,
        PERF_REG_POWERPC_SIER3,
+       PERF_REG_POWERPC_PMC1,
+       PERF_REG_POWERPC_PMC2,
+       PERF_REG_POWERPC_PMC3,
+       PERF_REG_POWERPC_PMC4,
+       PERF_REG_POWERPC_PMC5,
+       PERF_REG_POWERPC_PMC6,
        /* Max regs without the extended regs */
        PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
 
 #define PERF_REG_PMU_MASK      ((1ULL << PERF_REG_POWERPC_MAX) - 1)
 
-/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
-#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
-/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31 */
-#define PERF_REG_PMU_MASK_31   (((1ULL << (PERF_REG_POWERPC_SIER3 + 1)) - 1) - PERF_REG_PMU_MASK)
+/* Exclude MMCR3, SIER2, SIER3 for CPU_FTR_ARCH_300 */
+#define PERF_EXCLUDE_REG_EXT_300       (7ULL << PERF_REG_POWERPC_MMCR3)
 
-#define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
-#define PERF_REG_MAX_ISA_31    (PERF_REG_POWERPC_SIER3 + 1)
+/*
+ * PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300
+ * includes 9 SPRs from MMCR0 to PMC6, excluding the
+ * unsupported SPRs in PERF_EXCLUDE_REG_EXT_300.
+ */
+#define PERF_REG_PMU_MASK_300   ((0xfffULL << PERF_REG_POWERPC_MMCR0) - PERF_EXCLUDE_REG_EXT_300)
+
+/*
+ * PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31
+ * includes 12 SPRs from MMCR0 to PMC6.
+ */
+#define PERF_REG_PMU_MASK_31   (0xfffULL << PERF_REG_POWERPC_MMCR0)
+
+#define PERF_REG_EXTENDED_MAX  (PERF_REG_POWERPC_PMC6 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
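
Since PERF_EXCLUDE_REG_EXT_300 is a strict subset of the 12-bit MMCR0..PMC6 span, the two masks differ by exactly the three excluded registers. An illustrative compile-time check (not part of the header):

	/* Illustrative only: the ISA 3.1 mask equals the ISA 3.0 mask plus
	 * the MMCR3/SIER2/SIER3 bits. */
	_Static_assert(PERF_REG_PMU_MASK_31 ==
		       (PERF_REG_PMU_MASK_300 | PERF_EXCLUDE_REG_EXT_300),
		       "extended register masks disagree");
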
index 79ee775..6084fa4 100644
@@ -46,10 +46,10 @@ obj-y                               := cputable.o syscalls.o \
                                   prom.o traps.o setup-common.o \
                                   udbg.o misc.o io.o misc_$(BITS).o \
                                   of_platform.o prom_parse.o firmware.o \
-                                  hw_breakpoint_constraints.o
+                                  hw_breakpoint_constraints.o interrupt.o
 obj-y                          += ptrace/
 obj-$(CONFIG_PPC64)            += setup_64.o \
-                                  paca.o nvram_64.o note.o syscall_64.o
+                                  paca.o nvram_64.o note.o
 obj-$(CONFIG_COMPAT)           += sys_ppc32.o signal_32.o
 obj-$(CONFIG_VDSO32)           += vdso32_wrapper.o
 obj-$(CONFIG_PPC_WATCHDOG)     += watchdog.o
index b690c70..f3a6622 100644
@@ -255,7 +255,6 @@ int main(void)
 #endif /* CONFIG_PPC_MM_SLICES */
        OFFSET(PACA_EXGEN, paca_struct, exgen);
        OFFSET(PACA_EXMC, paca_struct, exmc);
-       OFFSET(PACA_EXSLB, paca_struct, exslb);
        OFFSET(PACA_EXNMI, paca_struct, exnmi);
 #ifdef CONFIG_PPC_PSERIES
        OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
@@ -309,7 +308,7 @@ int main(void)
 
        /* Interrupt register frame */
        DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE);
-       DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs));
+       DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_WITH_PT_REGS);
        STACK_PT_REGS_OFFSET(GPR0, gpr[0]);
        STACK_PT_REGS_OFFSET(GPR1, gpr[1]);
        STACK_PT_REGS_OFFSET(GPR2, gpr[2]);
index 52680cf..5545c9c 100644
 #include <linux/hardirq.h>
 
 #include <asm/dbell.h>
+#include <asm/interrupt.h>
 #include <asm/irq_regs.h>
 #include <asm/kvm_ppc.h>
 #include <asm/trace.h>
 
 #ifdef CONFIG_SMP
 
-void doorbell_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
 
-       irq_enter();
        trace_doorbell_entry(regs);
 
        ppc_msgsync();
@@ -35,13 +35,12 @@ void doorbell_exception(struct pt_regs *regs)
        smp_ipi_demux_relaxed(); /* already performed the barrier */
 
        trace_doorbell_exit(regs);
-       irq_exit();
+
        set_irq_regs(old_regs);
 }
 #else /* CONFIG_SMP */
-void doorbell_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception)
 {
        printk(KERN_WARNING "Received doorbell on non-smp system\n");
 }
 #endif /* CONFIG_SMP */
-
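
DEFINE_INTERRUPT_HANDLER_ASYNC comes from the new asm/interrupt.h wrappers: the irq_enter()/irq_exit() bookkeeping that doorbell_exception() used to open-code moves into generated entry/exit glue around the handler body. Roughly, the macro has this shape (simplified sketch; the real definition in asm/interrupt.h tracks more state):

	#define DEFINE_INTERRUPT_HANDLER_ASYNC(func)			\
	static __always_inline void ____##func(struct pt_regs *regs);	\
									\
	__visible void func(struct pt_regs *regs)			\
	{								\
		struct interrupt_state state;				\
									\
		interrupt_async_enter_prepare(regs, &state);		\
		____##func(regs);		/* the body below */	\
		interrupt_async_exit_prepare(regs, &state);		\
	}								\
									\
	static __always_inline void ____##func(struct pt_regs *regs)
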
index 813713c..cd60bc1 100644
@@ -1596,6 +1596,35 @@ static int proc_eeh_show(struct seq_file *m, void *v)
 }
 
 #ifdef CONFIG_DEBUG_FS
+
+
+static struct pci_dev *eeh_debug_lookup_pdev(struct file *filp,
+                                            const char __user *user_buf,
+                                            size_t count, loff_t *ppos)
+{
+       uint32_t domain, bus, dev, fn;
+       struct pci_dev *pdev;
+       char buf[20];
+       int ret;
+
+       memset(buf, 0, sizeof(buf));
+       ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+       if (!ret)
+               return ERR_PTR(-EFAULT);
+
+       ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+       if (ret != 4) {
+               pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+               return ERR_PTR(-EINVAL);
+       }
+
+       pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+       if (!pdev)
+               return ERR_PTR(-ENODEV);
+
+       return pdev;
+}
+
 static int eeh_enable_dbgfs_set(void *data, u64 val)
 {
        if (val)
@@ -1688,26 +1717,13 @@ static ssize_t eeh_dev_check_write(struct file *filp,
                                const char __user *user_buf,
                                size_t count, loff_t *ppos)
 {
-       uint32_t domain, bus, dev, fn;
        struct pci_dev *pdev;
        struct eeh_dev *edev;
-       char buf[20];
        int ret;
 
-       memset(buf, 0, sizeof(buf));
-       ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
-       if (!ret)
-               return -EFAULT;
-
-       ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
-       if (ret != 4) {
-               pr_err("%s: expected 4 args, got %d\n", __func__, ret);
-               return -EINVAL;
-       }
-
-       pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
-       if (!pdev)
-               return -ENODEV;
+       pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
 
        edev = pci_dev_to_eeh_dev(pdev);
        if (!edev) {
@@ -1717,8 +1733,8 @@ static ssize_t eeh_dev_check_write(struct file *filp,
        }
 
        ret = eeh_dev_check_failure(edev);
-       pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
-                       domain, bus, dev, fn, ret);
+       pci_info(pdev, "eeh_dev_check_failure(%s) = %d\n",
+                       pci_name(pdev), ret);
 
        pci_dev_put(pdev);
 
@@ -1829,25 +1845,12 @@ static ssize_t eeh_dev_break_write(struct file *filp,
                                const char __user *user_buf,
                                size_t count, loff_t *ppos)
 {
-       uint32_t domain, bus, dev, fn;
        struct pci_dev *pdev;
-       char buf[20];
        int ret;
 
-       memset(buf, 0, sizeof(buf));
-       ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
-       if (!ret)
-               return -EFAULT;
-
-       ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
-       if (ret != 4) {
-               pr_err("%s: expected 4 args, got %d\n", __func__, ret);
-               return -EINVAL;
-       }
-
-       pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
-       if (!pdev)
-               return -ENODEV;
+       pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
 
        ret = eeh_debugfs_break_device(pdev);
        pci_dev_put(pdev);
@@ -1865,6 +1868,53 @@ static const struct file_operations eeh_dev_break_fops = {
        .read   = eeh_debugfs_dev_usage,
 };
 
+static ssize_t eeh_dev_can_recover(struct file *filp,
+                                  const char __user *user_buf,
+                                  size_t count, loff_t *ppos)
+{
+       struct pci_driver *drv;
+       struct pci_dev *pdev;
+       size_t ret;
+
+       pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       /*
+        * In order for error recovery to work the driver needs to implement
+        * .error_detected(), so it can quiesce IO to the device, and
+        * .slot_reset() so it can re-initialise the device after a reset.
+        *
+        * Ideally they'd implement .resume() too, but some drivers which
+        * we need to support (notably IPR) don't, so I guess we can
+        * tolerate that.
+        *
+        * .mmio_enabled() is mostly there as a work-around for devices which
+        * take forever to re-init after a hot reset. Implementing that is
+        * strictly optional.
+        */
+       drv = pci_dev_driver(pdev);
+       if (drv &&
+           drv->err_handler &&
+           drv->err_handler->error_detected &&
+           drv->err_handler->slot_reset) {
+               ret = count;
+       } else {
+               ret = -EOPNOTSUPP;
+       }
+
+       pci_dev_put(pdev);
+
+       return ret;
+}
+
+static const struct file_operations eeh_dev_can_recover_fops = {
+       .open   = simple_open,
+       .llseek = no_llseek,
+       .write  = eeh_dev_can_recover,
+       .read   = eeh_debugfs_dev_usage,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1889,6 +1939,9 @@ static int __init eeh_init_proc(void)
                debugfs_create_file_unsafe("eeh_force_recover", 0600,
                                powerpc_debugfs_root, NULL,
                                &eeh_force_recover_fops);
+               debugfs_create_file_unsafe("eeh_dev_can_recover", 0600,
+                               powerpc_debugfs_root, NULL,
+                               &eeh_dev_can_recover_fops);
                eeh_cache_debugfs_init();
 #endif
        }
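
With the file registered, userspace can ask whether the driver bound to a device implements the handlers needed for recovery by writing the device's address to the new entry. An illustrative (hypothetical) userspace probe, assuming debugfs is mounted at /sys/kernel/debug:

	/* Hypothetical userspace sketch, not part of the patch. The write
	 * succeeds only when the bound driver provides .error_detected()
	 * and .slot_reset(). */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *bdf = "0000:01:00.0";	/* domain:bus:dev.fn, hex */
		int fd = open("/sys/kernel/debug/powerpc/eeh_dev_can_recover",
			      O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, bdf, strlen(bdf)) < 0)
			perror("not recoverable");
		close(fd);
		return 0;
	}
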
index 1c9b0cc..78c430b 100644
@@ -175,14 +175,11 @@ transfer_to_handler:
        addi    r11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
        lwz     r9,TASK_CPU(r2)
-       slwi    r9,r9,3
+       slwi    r9,r9,2
        add     r11,r11,r9
 #endif
        lwz     r12,0(r11)
        mtspr   SPRN_DBCR0,r12
-       lwz     r12,4(r11)
-       addi    r12,r12,-1
-       stw     r12,4(r11)
 #endif
 
        b       3f
@@ -276,8 +273,7 @@ reenable_mmu:
         * We save a bunch of GPRs,
         * r3 can be different from GPR3(r1) at this point, r9 and r11
         * contain the old MSR and handler address respectively,
-        * r4 & r5 can contain page fault arguments that need to be passed
-        * along as well. r0, r6-r8, r12, CCR, CTR, XER etc... are left
+        * r0, r4-r8, r12, CCR, CTR, XER etc... are left
         * clobbered as they aren't useful past this point.
         */
 
@@ -285,15 +281,11 @@ reenable_mmu:
        stw     r9,8(r1)
        stw     r11,12(r1)
        stw     r3,16(r1)
-       stw     r4,20(r1)
-       stw     r5,24(r1)
 
        /* If we are disabling interrupts (normal case), simply log it with
         * lockdep
         */
 1:     bl      trace_hardirqs_off
-       lwz     r5,24(r1)
-       lwz     r4,20(r1)
        lwz     r3,16(r1)
        lwz     r11,12(r1)
        lwz     r9,8(r1)
@@ -334,132 +326,29 @@ stack_ovf:
 _ASM_NOKPROBE_SYMBOL(stack_ovf)
 #endif
 
-#ifdef CONFIG_TRACE_IRQFLAGS
-trace_syscall_entry_irq_off:
-       /*
-        * Syscall shouldn't happen while interrupts are disabled,
-        * so let's do a warning here.
-        */
-0:     trap
-       EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-       bl      trace_hardirqs_on
-
-       /* Now enable for real */
-       LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
-       mtmsr   r10
-
-       REST_GPR(0, r1)
-       REST_4GPRS(3, r1)
-       REST_2GPRS(7, r1)
-       b       DoSyscall
-#endif /* CONFIG_TRACE_IRQFLAGS */
-
        .globl  transfer_to_syscall
 transfer_to_syscall:
-#ifdef CONFIG_TRACE_IRQFLAGS
-       andi.   r12,r9,MSR_EE
-       beq-    trace_syscall_entry_irq_off
-#endif /* CONFIG_TRACE_IRQFLAGS */
+       SAVE_NVGPRS(r1)
+#ifdef CONFIG_PPC_BOOK3S_32
+       kuep_lock r11, r12
+#endif
 
-/*
- * Handle a system call.
- */
-       .stabs  "arch/powerpc/kernel/",N_SO,0,0,0f
-       .stabs  "entry_32.S",N_SO,0,0,0f
-0:
-
-_GLOBAL(DoSyscall)
-       stw     r3,ORIG_GPR3(r1)
-       li      r12,0
-       stw     r12,RESULT(r1)
-#ifdef CONFIG_TRACE_IRQFLAGS
-       /* Make sure interrupts are enabled */
-       mfmsr   r11
-       andi.   r12,r11,MSR_EE
-       /* We came in with interrupts disabled, we WARN and mark them enabled
-        * for lockdep now */
-0:     tweqi   r12, 0
-       EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-#endif /* CONFIG_TRACE_IRQFLAGS */
-       lwz     r11,TI_FLAGS(r2)
-       andi.   r11,r11,_TIF_SYSCALL_DOTRACE
-       bne-    syscall_dotrace
-syscall_dotrace_cont:
-       cmplwi  0,r0,NR_syscalls
-       lis     r10,sys_call_table@h
-       ori     r10,r10,sys_call_table@l
-       slwi    r0,r0,2
-       bge-    66f
-
-       barrier_nospec_asm
-       /*
-        * Prevent the load of the handler below (based on the user-passed
-        * system call number) being speculatively executed until the test
-        * against NR_syscalls and branch to .66f above has
-        * committed.
-        */
+       /* Calling convention has r9 = orig r0, r10 = regs */
+       addi    r10,r1,STACK_FRAME_OVERHEAD
+       mr      r9,r0
+       stw     r10,THREAD+PT_REGS(r2)
+       bl      system_call_exception
 
-       lwzx    r10,r10,r0      /* Fetch system call handler [ptr] */
-       mtlr    r10
-       addi    r9,r1,STACK_FRAME_OVERHEAD
-       PPC440EP_ERR42
-       blrl                    /* Call handler */
-       .globl  ret_from_syscall
 ret_from_syscall:
-#ifdef CONFIG_DEBUG_RSEQ
-       /* Check whether the syscall is issued inside a restartable sequence */
-       stw     r3,GPR3(r1)
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      rseq_syscall
-       lwz     r3,GPR3(r1)
-#endif
-       mr      r6,r3
-       /* disable interrupts so current_thread_info()->flags can't change */
-       LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)      /* doesn't include MSR_EE */
-       /* Note: We don't bother telling lockdep about it */
-       mtmsr   r10
-       lwz     r9,TI_FLAGS(r2)
-       li      r8,-MAX_ERRNO
-       andi.   r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
-       bne-    syscall_exit_work
-       cmplw   0,r3,r8
-       blt+    syscall_exit_cont
-       lwz     r11,_CCR(r1)                    /* Load CR */
-       neg     r3,r3
-       oris    r11,r11,0x1000  /* Set SO bit in CR */
-       stw     r11,_CCR(r1)
-syscall_exit_cont:
-       lwz     r8,_MSR(r1)
-#ifdef CONFIG_TRACE_IRQFLAGS
-       /* If we are going to return from the syscall with interrupts
-        * off, we trace that here. It shouldn't normally happen.
-        */
-       andi.   r10,r8,MSR_EE
-       bne+    1f
-       stw     r3,GPR3(r1)
-       bl      trace_hardirqs_off
-       lwz     r3,GPR3(r1)
-1:
-#endif /* CONFIG_TRACE_IRQFLAGS */
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-       /* If the process has its own DBCR0 value, load it up.  The internal
-          debug mode bit tells us that dbcr0 should be loaded. */
-       lwz     r0,THREAD+THREAD_DBCR0(r2)
-       andis.  r10,r0,DBCR0_IDM@h
-       bnel-   load_dbcr0
-#endif
+       addi    r4,r1,STACK_FRAME_OVERHEAD
+       li      r5,0
+       bl      syscall_exit_prepare
 #ifdef CONFIG_PPC_47x
        lis     r4,icache_44x_need_flush@ha
        lwz     r5,icache_44x_need_flush@l(r4)
        cmplwi  cr0,r5,0
        bne-    2f
 #endif /* CONFIG_PPC_47x */
-1:
-BEGIN_FTR_SECTION
-       lwarx   r7,0,r1
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
-       stwcx.  r0,0,r1                 /* to clear the reservation */
-       ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
 #ifdef CONFIG_PPC_BOOK3S_32
        kuep_unlock r5, r7
 #endif
@@ -467,21 +356,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
        lwz     r4,_LINK(r1)
        lwz     r5,_CCR(r1)
        mtlr    r4
-       mtcr    r5
        lwz     r7,_NIP(r1)
-       lwz     r2,GPR2(r1)
-       lwz     r1,GPR1(r1)
+       lwz     r8,_MSR(r1)
+       cmpwi   r3,0
+       lwz     r3,GPR3(r1)
 syscall_exit_finish:
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-       mtspr   SPRN_NRI, r0
-#endif
        mtspr   SPRN_SRR0,r7
        mtspr   SPRN_SRR1,r8
+
+       bne     3f
+       mtcr    r5
+
+1:     lwz     r2,GPR2(r1)
+       lwz     r1,GPR1(r1)
        rfi
 #ifdef CONFIG_40x
        b .     /* Prevent prefetch past rfi */
 #endif
-_ASM_NOKPROBE_SYMBOL(syscall_exit_finish)
+
+3:     mtcr    r5
+       lwz     r4,_CTR(r1)
+       lwz     r5,_XER(r1)
+       REST_NVGPRS(r1)
+       mtctr   r4
+       mtxer   r5
+       lwz     r0,GPR0(r1)
+       lwz     r3,GPR3(r1)
+       REST_8GPRS(4,r1)
+       lwz     r12,GPR12(r1)
+       b       1b
+
 #ifdef CONFIG_44x
 2:     li      r7,0
        iccci   r0,r0
@@ -489,9 +393,6 @@ _ASM_NOKPROBE_SYMBOL(syscall_exit_finish)
        b       1b
 #endif  /* CONFIG_44x */
 
-66:    li      r3,-ENOSYS
-       b       ret_from_syscall
-
        .globl  ret_from_fork
 ret_from_fork:
        REST_NVGPRS(r1)
@@ -510,157 +411,6 @@ ret_from_kernel_thread:
        li      r3,0
        b       ret_from_syscall
 
-/* Traced system call support */
-syscall_dotrace:
-       SAVE_NVGPRS(r1)
-       li      r0,0xc00
-       stw     r0,_TRAP(r1)
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      do_syscall_trace_enter
-       /*
-        * Restore argument registers possibly just changed.
-        * We use the return value of do_syscall_trace_enter
-        * for call number to look up in the table (r0).
-        */
-       mr      r0,r3
-       lwz     r3,GPR3(r1)
-       lwz     r4,GPR4(r1)
-       lwz     r5,GPR5(r1)
-       lwz     r6,GPR6(r1)
-       lwz     r7,GPR7(r1)
-       lwz     r8,GPR8(r1)
-       REST_NVGPRS(r1)
-
-       cmplwi  r0,NR_syscalls
-       /* Return code is already in r3 thanks to do_syscall_trace_enter() */
-       bge-    ret_from_syscall
-       b       syscall_dotrace_cont
-
-syscall_exit_work:
-       andi.   r0,r9,_TIF_RESTOREALL
-       beq+    0f
-       REST_NVGPRS(r1)
-       b       2f
-0:     cmplw   0,r3,r8
-       blt+    1f
-       andi.   r0,r9,_TIF_NOERROR
-       bne-    1f
-       lwz     r11,_CCR(r1)                    /* Load CR */
-       neg     r3,r3
-       oris    r11,r11,0x1000  /* Set SO bit in CR */
-       stw     r11,_CCR(r1)
-
-1:     stw     r6,RESULT(r1)   /* Save result */
-       stw     r3,GPR3(r1)     /* Update return value */
-2:     andi.   r0,r9,(_TIF_PERSYSCALL_MASK)
-       beq     4f
-
-       /* Clear per-syscall TIF flags if any are set.  */
-
-       li      r11,_TIF_PERSYSCALL_MASK
-       addi    r12,r2,TI_FLAGS
-3:     lwarx   r8,0,r12
-       andc    r8,r8,r11
-       stwcx.  r8,0,r12
-       bne-    3b
-       
-4:     /* Anything which requires enabling interrupts? */
-       andi.   r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
-       beq     ret_from_except
-
-       /* Re-enable interrupts. There is no need to trace that with
-        * lockdep as we are supposed to have IRQs on at this point
-        */
-       ori     r10,r10,MSR_EE
-       mtmsr   r10
-
-       /* Save NVGPRS if they're not saved already */
-       lwz     r4,_TRAP(r1)
-       andi.   r4,r4,1
-       beq     5f
-       SAVE_NVGPRS(r1)
-       li      r4,0xc00
-       stw     r4,_TRAP(r1)
-5:
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      do_syscall_trace_leave
-       b       ret_from_except_full
-
-       /*
-        * System call was called from kernel. We get here with SRR1 in r9.
-        * Mark the exception as recoverable once we have retrieved SRR0,
-        * trap a warning and return ENOSYS with CR[SO] set.
-        */
-       .globl  ret_from_kernel_syscall
-ret_from_kernel_syscall:
-       mfspr   r9, SPRN_SRR0
-       mfspr   r10, SPRN_SRR1
-#if !defined(CONFIG_4xx) && !defined(CONFIG_BOOKE)
-       LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~(MSR_IR|MSR_DR))
-       mtmsr   r11
-#endif
-
-0:     trap
-       EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-
-       li      r3, ENOSYS
-       crset   so
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-       mtspr   SPRN_NRI, r0
-#endif
-       mtspr   SPRN_SRR0, r9
-       mtspr   SPRN_SRR1, r10
-       rfi
-#ifdef CONFIG_40x
-       b .     /* Prevent prefetch past rfi */
-#endif
-_ASM_NOKPROBE_SYMBOL(ret_from_kernel_syscall)
-
-/*
- * The fork/clone functions need to copy the full register set into
- * the child process. Therefore we need to save all the nonvolatile
- * registers (r13 - r31) before calling the C code.
- */
-       .globl  ppc_fork
-ppc_fork:
-       SAVE_NVGPRS(r1)
-       lwz     r0,_TRAP(r1)
-       rlwinm  r0,r0,0,0,30            /* clear LSB to indicate full */
-       stw     r0,_TRAP(r1)            /* register set saved */
-       b       sys_fork
-
-       .globl  ppc_vfork
-ppc_vfork:
-       SAVE_NVGPRS(r1)
-       lwz     r0,_TRAP(r1)
-       rlwinm  r0,r0,0,0,30            /* clear LSB to indicate full */
-       stw     r0,_TRAP(r1)            /* register set saved */
-       b       sys_vfork
-
-       .globl  ppc_clone
-ppc_clone:
-       SAVE_NVGPRS(r1)
-       lwz     r0,_TRAP(r1)
-       rlwinm  r0,r0,0,0,30            /* clear LSB to indicate full */
-       stw     r0,_TRAP(r1)            /* register set saved */
-       b       sys_clone
-
-       .globl  ppc_clone3
-ppc_clone3:
-       SAVE_NVGPRS(r1)
-       lwz     r0,_TRAP(r1)
-       rlwinm  r0,r0,0,0,30            /* clear LSB to indicate full */
-       stw     r0,_TRAP(r1)            /* register set saved */
-       b       sys_clone3
-
-       .globl  ppc_swapcontext
-ppc_swapcontext:
-       SAVE_NVGPRS(r1)
-       lwz     r0,_TRAP(r1)
-       rlwinm  r0,r0,0,0,30            /* clear LSB to indicate full */
-       stw     r0,_TRAP(r1)            /* register set saved */
-       b       sys_swapcontext
-
 /*
  * Top-level page fault handling.
  * This is in assembler because if do_page_fault tells us that
@@ -670,10 +420,6 @@ ppc_swapcontext:
        .globl  handle_page_fault
 handle_page_fault:
        addi    r3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S_32
-       andis.  r0,r5,DSISR_DABRMATCH@h
-       bne-    handle_dabr_fault
-#endif
        bl      do_page_fault
        cmpwi   r3,0
        beq+    ret_from_except
@@ -681,23 +427,11 @@ handle_page_fault:
        lwz     r0,_TRAP(r1)
        clrrwi  r0,r0,1
        stw     r0,_TRAP(r1)
-       mr      r5,r3
+       mr      r4,r3           /* err arg for bad_page_fault */
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       lwz     r4,_DAR(r1)
        bl      __bad_page_fault
        b       ret_from_except_full
 
-#ifdef CONFIG_PPC_BOOK3S_32
-       /* We have a data breakpoint exception - handle it */
-handle_dabr_fault:
-       SAVE_NVGPRS(r1)
-       lwz     r0,_TRAP(r1)
-       clrrwi  r0,r0,1
-       stw     r0,_TRAP(r1)
-       bl      do_break
-       b       ret_from_except_full
-#endif
-
 /*
  * This routine switches between two different tasks.  The process
  * state of one is saved on its kernel stack.  Then the state
@@ -1237,14 +971,11 @@ load_dbcr0:
        addi    r11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
        lwz     r9,TASK_CPU(r2)
-       slwi    r9,r9,3
+       slwi    r9,r9,2
        add     r11,r11,r9
 #endif
        stw     r10,0(r11)
        mtspr   SPRN_DBCR0,r0
-       lwz     r10,4(r11)
-       addi    r10,r10,1
-       stw     r10,4(r11)
        li      r11,-1
        mtspr   SPRN_DBSR,r11   /* clear all pending debug events */
        blr
@@ -1253,7 +984,7 @@ load_dbcr0:
        .align  4
        .global global_dbcr0
 global_dbcr0:
-       .space  8*NR_CPUS
+       .space  4*NR_CPUS
        .previous
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
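
The transfer_to_syscall stub above now just saves state and hands off to the shared C syscall path. The calling convention matches the register setup in the stub: r3-r8 carry the six syscall arguments, r9 the original r0 (the syscall number), and r10 the pt_regs pointer. A prototype sketch of the two C entry points (as added by this series in interrupt.c; shown here for orientation):

	/* Sketch of the C entry points the 32-bit stub branches to. */
	long system_call_exception(long r3, long r4, long r5,
				   long r6, long r7, long r8,
				   unsigned long r0, struct pt_regs *regs);

	/* The return value tells the asm whether the fast exit path may
	 * skip restoring the non-volatile GPRs (the cmpwi r3,0 above). */
	unsigned long syscall_exit_prepare(unsigned long r3,
					   struct pt_regs *regs, long scv);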
 
index 33ddfee..6c4d9e2 100644
@@ -108,7 +108,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
        li      r11,\trapnr
        std     r11,_TRAP(r1)
        std     r12,_CCR(r1)
-       std     r3,ORIG_GPR3(r1)
        addi    r10,r1,STACK_FRAME_OVERHEAD
        ld      r11,exception_marker@toc(r2)
        std     r11,-16(r10)            /* "regshere" marker */
@@ -225,6 +224,12 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_emulate)
        b       system_call_vectored_common
 #endif
 
+       .balign IFETCH_ALIGN_BYTES
+       .globl system_call_common_real
+system_call_common_real:
+       ld      r10,PACAKMSR(r13)       /* get MSR value for kernel */
+       mtmsrd  r10
+
        .balign IFETCH_ALIGN_BYTES
        .globl system_call_common
 system_call_common:
@@ -278,7 +283,6 @@ END_BTB_FLUSH_SECTION
        std     r10,_LINK(r1)
        std     r11,_TRAP(r1)
        std     r12,_CCR(r1)
-       std     r3,ORIG_GPR3(r1)
        addi    r10,r1,STACK_FRAME_OVERHEAD
        ld      r11,exception_marker@toc(r2)
        std     r11,-16(r10)            /* "regshere" marker */
index 74d07dc..e8eb999 100644
@@ -398,7 +398,6 @@ exc_##n##_common:                                                       \
        std     r10,_NIP(r1);           /* save SRR0 to stackframe */       \
        std     r11,_MSR(r1);           /* save SRR1 to stackframe */       \
        beq     2f;                     /* if from kernel mode */           \
-       ACCOUNT_CPU_USER_ENTRY(r13,r10,r11);/* accounting (uses cr0+eq) */  \
 2:     ld      r3,excf+EX_R10(r13);    /* get back r10 */                  \
        ld      r4,excf+EX_R11(r13);    /* get back r11 */                  \
        mfspr   r5,scratch;             /* get back r13 */                  \
@@ -791,7 +790,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        EXCEPTION_COMMON_CRIT(0xd00)
        std     r14,_DSISR(r1)
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       mr      r4,r14
        ld      r14,PACA_EXCRIT+EX_R14(r13)
        ld      r15,PACA_EXCRIT+EX_R15(r13)
        bl      save_nvgprs
@@ -864,7 +862,6 @@ kernel_dbg_exc:
        INTS_DISABLE
        std     r14,_DSISR(r1)
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       mr      r4,r14
        ld      r14,PACA_EXDBG+EX_R14(r13)
        ld      r15,PACA_EXDBG+EX_R15(r13)
        bl      save_nvgprs
@@ -1011,8 +1008,6 @@ storage_fault_common:
        std     r14,_DAR(r1)
        std     r15,_DSISR(r1)
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       mr      r4,r14
-       mr      r5,r15
        ld      r14,PACA_EXGEN+EX_R14(r13)
        ld      r15,PACA_EXGEN+EX_R15(r13)
        bl      do_page_fault
@@ -1020,9 +1015,8 @@ storage_fault_common:
        bne-    1f
        b       ret_from_except_lite
 1:     bl      save_nvgprs
-       mr      r5,r3
+       mr      r4,r3
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       ld      r4,_DAR(r1)
        bl      __bad_page_fault
        b       ret_from_except
 
index 6e53f76..60d3051 100644
@@ -139,7 +139,6 @@ name:
 #define IKVM_VIRT      .L_IKVM_VIRT_\name\()   /* Virt entry tests KVM */
 #define ISTACK         .L_ISTACK_\name\()      /* Set regular kernel stack */
 #define __ISTACK(name) .L_ISTACK_ ## name
-#define IRECONCILE     .L_IRECONCILE_\name\()  /* Do RECONCILE_IRQ_STATE */
 #define IKUAP          .L_IKUAP_\name\()       /* Do KUAP lock */
 
 #define INT_DEFINE_BEGIN(n)                                            \
@@ -203,9 +202,6 @@ do_define_int n
        .ifndef ISTACK
                ISTACK=1
        .endif
-       .ifndef IRECONCILE
-               IRECONCILE=1
-       .endif
        .ifndef IKUAP
                IKUAP=1
        .endif
@@ -581,7 +577,6 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
        kuap_save_amr_and_lock r9, r10, cr1, cr0
        .endif
        beq     101f                    /* if from kernel mode          */
-       ACCOUNT_CPU_USER_ENTRY(r13, r9, r10)
 BEGIN_FTR_SECTION
        ld      r9,IAREA+EX_PPR(r13)    /* Read PPR from paca           */
        std     r9,_PPR(r1)
@@ -649,14 +644,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
        ld      r11,exception_marker@toc(r2)
        std     r10,RESULT(r1)          /* clear regs->result           */
        std     r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame       */
-
-       .if ISTACK
-       ACCOUNT_STOLEN_TIME
-       .endif
-
-       .if IRECONCILE
-       RECONCILE_IRQ_STATE(r10, r11)
-       .endif
 .endm
 
 /*
@@ -705,14 +692,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
        ld      r1,GPR1(r1)
 .endm
 
-#define RUNLATCH_ON                            \
-BEGIN_FTR_SECTION                              \
-       ld      r3, PACA_THREAD_INFO(r13);      \
-       ld      r4,TI_LOCAL_FLAGS(r3);          \
-       andi.   r0,r4,_TLF_RUNLATCH;            \
-       beql    ppc64_runlatch_on_trampoline;   \
-END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
-
 /*
  * When the idle code in power4_idle puts the CPU into NAP mode,
  * it has to do so in a loop, and relies on the external interrupt
@@ -935,7 +914,6 @@ INT_DEFINE_BEGIN(system_reset)
         */
        ISET_RI=0
        ISTACK=0
-       IRECONCILE=0
        IKVM_REAL=1
 INT_DEFINE_END(system_reset)
 
@@ -1022,20 +1000,6 @@ EXC_COMMON_BEGIN(system_reset_common)
        ld      r1,PACA_NMI_EMERG_SP(r13)
        subi    r1,r1,INT_FRAME_SIZE
        __GEN_COMMON_BODY system_reset
-       /*
-        * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
-        * the right thing. We do not want to reconcile because that goes
-        * through irq tracing which we don't want in NMI.
-        *
-        * Save PACAIRQHAPPENED to RESULT (otherwise unused), and set HARD_DIS
-        * as we are running with MSR[EE]=0.
-        */
-       li      r10,IRQS_ALL_DISABLED
-       stb     r10,PACAIRQSOFTMASK(r13)
-       lbz     r10,PACAIRQHAPPENED(r13)
-       std     r10,RESULT(r1)
-       ori     r10,r10,PACA_IRQ_HARD_DIS
-       stb     r10,PACAIRQHAPPENED(r13)
 
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      system_reset_exception
@@ -1051,14 +1015,6 @@ EXC_COMMON_BEGIN(system_reset_common)
        subi    r10,r10,1
        sth     r10,PACA_IN_NMI(r13)
 
-       /*
-        * Restore soft mask settings.
-        */
-       ld      r10,RESULT(r1)
-       stb     r10,PACAIRQHAPPENED(r13)
-       ld      r10,SOFTE(r1)
-       stb     r10,PACAIRQSOFTMASK(r13)
-
        kuap_kernel_restore r9, r10
        EXCEPTION_RESTORE_REGS
        RFI_TO_USER_OR_KERNEL
@@ -1123,7 +1079,6 @@ INT_DEFINE_BEGIN(machine_check_early)
        ISTACK=0
        IDAR=1
        IDSISR=1
-       IRECONCILE=0
        IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */
 INT_DEFINE_END(machine_check_early)
 
@@ -1205,30 +1160,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
        li      r10,MSR_RI
        mtmsrd  r10,1
 
-       /*
-        * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
-        * system_reset_common)
-        */
-       li      r10,IRQS_ALL_DISABLED
-       stb     r10,PACAIRQSOFTMASK(r13)
-       lbz     r10,PACAIRQHAPPENED(r13)
-       std     r10,RESULT(r1)
-       ori     r10,r10,PACA_IRQ_HARD_DIS
-       stb     r10,PACAIRQHAPPENED(r13)
-
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      machine_check_early
        std     r3,RESULT(r1)   /* Save result */
        ld      r12,_MSR(r1)
 
-       /*
-        * Restore soft mask settings.
-        */
-       ld      r10,RESULT(r1)
-       stb     r10,PACAIRQHAPPENED(r13)
-       ld      r10,SOFTE(r1)
-       stb     r10,PACAIRQSOFTMASK(r13)
-
 #ifdef CONFIG_PPC_P7_NAP
        /*
         * Check if thread was in power saving mode. We come here when any
@@ -1401,14 +1337,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
  *
  * Handling:
  * - Hash MMU
- *   Go to do_hash_page first to see if the HPT can be filled from an entry in
- *   the Linux page table. Hash faults can hit in kernel mode in a fairly
+ *   Go to do_hash_fault, which attempts to fill the HPT from an entry in the
+ *   Linux page table. Hash faults can hit in kernel mode in a fairly
  *   arbitrary state (e.g., interrupts disabled, locks held) when accessing
  *   "non-bolted" regions, e.g., vmalloc space. However these should always be
- *   backed by Linux page tables.
+ *   backed by Linux page table entries.
  *
- *   If none is found, do a Linux page fault. Linux page faults can happen in
- *   kernel mode due to user copy operations of course.
+ *   If no entry is found, the Linux page fault handler is invoked (by
+ *   do_hash_fault). Linux page faults can happen in kernel mode due to user
+ *   copy operations, of course.
  *
  *   KVM: The KVM HDSI handler may perform a load with MSR[DR]=1 in guest
  *   MMU context, which may cause a DSI in the host, which must go to the
@@ -1437,15 +1374,24 @@ EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
 EXC_VIRT_END(data_access, 0x4300, 0x80)
 EXC_COMMON_BEGIN(data_access_common)
        GEN_COMMON data_access
-       ld      r4,_DAR(r1)
-       ld      r5,_DSISR(r1)
+       ld      r4,_DSISR(r1)
+       addi    r3,r1,STACK_FRAME_OVERHEAD
+       andis.  r0,r4,DSISR_DABRMATCH@h
+       bne-    1f
 BEGIN_MMU_FTR_SECTION
-       ld      r6,_MSR(r1)
-       li      r3,0x300
-       b       do_hash_page            /* Try to handle as hpte fault */
+       bl      do_hash_fault
 MMU_FTR_SECTION_ELSE
-       b       handle_page_fault
+       bl      do_page_fault
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+       b       interrupt_return
+
+1:     bl      do_break
+       /*
+        * do_break() may have changed the NV GPRS while handling a breakpoint.
+        * If so, we need to restore them with their updated values.
+        */
+       REST_NVGPRS(r1)
+       b       interrupt_return
 
        GEN_KVM data_access
 
@@ -1466,14 +1412,9 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
  *   on user-handler data structures.
  *
  *   KVM: Same as 0x300, DSLB must test for KVM guest.
- *
- * A dedicated save area EXSLB is used (XXX: but it actually need not be
- * these days, we could use EXGEN).
  */
 INT_DEFINE_BEGIN(data_access_slb)
        IVEC=0x380
-       IAREA=PACA_EXSLB
-       IRECONCILE=0
        IDAR=1
        IKVM_SKIP=1
        IKVM_REAL=1
@@ -1487,10 +1428,9 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 EXC_COMMON_BEGIN(data_access_slb_common)
        GEN_COMMON data_access_slb
-       ld      r4,_DAR(r1)
-       addi    r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
        /* HPT case, do SLB fault */
+       addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      do_slb_fault
        cmpdi   r3,0
        bne-    1f
@@ -1501,9 +1441,6 @@ MMU_FTR_SECTION_ELSE
        li      r3,-EFAULT
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
        std     r3,RESULT(r1)
-       RECONCILE_IRQ_STATE(r10, r11)
-       ld      r4,_DAR(r1)
-       ld      r5,RESULT(r1)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      do_bad_slb_fault
        b       interrupt_return
@@ -1538,15 +1475,13 @@ EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
 EXC_VIRT_END(instruction_access, 0x4400, 0x80)
 EXC_COMMON_BEGIN(instruction_access_common)
        GEN_COMMON instruction_access
-       ld      r4,_DAR(r1)
-       ld      r5,_DSISR(r1)
+       addi    r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-       ld      r6,_MSR(r1)
-       li      r3,0x400
-       b       do_hash_page            /* Try to handle as hpte fault */
+       bl      do_hash_fault
 MMU_FTR_SECTION_ELSE
-       b       handle_page_fault
+       bl      do_page_fault
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+       b       interrupt_return
 
        GEN_KVM instruction_access
 
@@ -1562,8 +1497,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
  */
 INT_DEFINE_BEGIN(instruction_access_slb)
        IVEC=0x480
-       IAREA=PACA_EXSLB
-       IRECONCILE=0
        IISIDE=1
        IDAR=1
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -1579,10 +1512,9 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
 EXC_COMMON_BEGIN(instruction_access_slb_common)
        GEN_COMMON instruction_access_slb
-       ld      r4,_DAR(r1)
-       addi    r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
        /* HPT case, do SLB fault */
+       addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      do_slb_fault
        cmpdi   r3,0
        bne-    1f
@@ -1593,9 +1525,6 @@ MMU_FTR_SECTION_ELSE
        li      r3,-EFAULT
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
        std     r3,RESULT(r1)
-       RECONCILE_IRQ_STATE(r10, r11)
-       ld      r4,_DAR(r1)
-       ld      r5,RESULT(r1)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      do_bad_slb_fault
        b       interrupt_return
@@ -1643,7 +1572,6 @@ EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 EXC_COMMON_BEGIN(hardware_interrupt_common)
        GEN_COMMON hardware_interrupt
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      do_IRQ
        b       interrupt_return
@@ -1697,6 +1625,51 @@ INT_DEFINE_BEGIN(program_check)
 INT_DEFINE_END(program_check)
 
 EXC_REAL_BEGIN(program_check, 0x700, 0x100)
+
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+       /*
+        * There's a short window during boot where although the kernel is
+        * running little endian, any exceptions will cause the CPU to switch
+        * back to big endian. For example a WARN() boils down to a trap
+        * instruction, which will cause a program check, and we end up here but
+        * with the CPU in big endian mode. The first instruction of the program
+        * check handler (in GEN_INT_ENTRY below) is an mtsprg, which when
+        * executed in the wrong endian is an lhzu with a ~3GB displacement from
+        * r3. The content of r3 is random, so that is a load from some random
+        * location, and depending on the system can easily lead to a checkstop,
+        * or an infinitely recursive page fault.
+        *
+        * So to handle that case we have a trampoline here that can detect we
+        * are in the wrong endian and flip us back to the correct endian. We
+        * can't flip MSR[LE] using mtmsr, so we have to use rfid. That requires
+        * backing up SRR0/1 as well as a GPR. To do that we use SPRG0/2/3, as
+        * SPRG1 is already used for the paca. SPRG3 is user readable, but this
+        * trampoline is only active very early in boot, and SPRG3 will be
+        * reinitialised in vdso_getcpu_init() before userspace starts.
+        */
+BEGIN_FTR_SECTION
+       tdi   0,0,0x48    // Trap never, or in reverse endian: b . + 8
+       b     1f          // Skip trampoline if endian is correct
+       .long 0xa643707d  // mtsprg  0, r11      Backup r11
+       .long 0xa6027a7d  // mfsrr0  r11
+       .long 0xa643727d  // mtsprg  2, r11      Backup SRR0 in SPRG2
+       .long 0xa6027b7d  // mfsrr1  r11
+       .long 0xa643737d  // mtsprg  3, r11      Backup SRR1 in SPRG3
+       .long 0xa600607d  // mfmsr   r11
+       .long 0x01006b69  // xori    r11, r11, 1 Invert MSR[LE]
+       .long 0xa6037b7d  // mtsrr1  r11
+       .long 0x34076039  // li      r11, 0x734
+       .long 0xa6037a7d  // mtsrr0  r11
+       .long 0x2400004c  // rfid
+       mfsprg r11, 3
+       mtsrr1 r11        // Restore SRR1
+       mfsprg r11, 2
+       mtsrr0 r11        // Restore SRR0
+       mfsprg r11, 0     // Restore r11
+1:
+END_FTR_SECTION(0, 1)     // nop out after boot
+#endif /* CONFIG_CPU_LITTLE_ENDIAN */
+
        GEN_INT_ENTRY program_check, virt=0
 EXC_REAL_END(program_check, 0x700, 0x100)
 EXC_VIRT_BEGIN(program_check, 0x4700, 0x100)
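
The trampoline above works because an instruction word fetched in the wrong
endianness is simply byte-reversed. A runnable check (plain C, not kernel
code) of the two encodings named in the comment:

#include <stdio.h>

int main(void)
{
        unsigned int b_plus_8 = 0x48000008;               /* b . + 8 */
        unsigned int swapped  = __builtin_bswap32(b_plus_8);

        printf("b .+8 byte-swapped = %#010x\n", swapped); /* 0x08000048 */
        printf("opcode=%u TO=%u rA=%u SI=%#x\n",
               swapped >> 26,            /* 2 -> tdi */
               (swapped >> 21) & 0x1f,   /* TO=0 -> trap never */
               (swapped >> 16) & 0x1f,   /* rA=0 */
               swapped & 0xffff);        /* SI=0x48 */
        return 0;
}

So the same word is a never-firing "tdi 0,0,0x48" when fetched in the
correct endian, and "b . + 8" when fetched reversed, which skips the
"b 1f" and falls into the reverse-encoded fixup sequence.
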
@@ -1755,7 +1728,6 @@ EXC_COMMON_BEGIN(program_check_common)
  */
 INT_DEFINE_BEGIN(fp_unavailable)
        IVEC=0x800
-       IRECONCILE=0
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        IKVM_REAL=1
 #endif
@@ -1770,7 +1742,6 @@ EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
 EXC_COMMON_BEGIN(fp_unavailable_common)
        GEN_COMMON fp_unavailable
        bne     1f                      /* if from user, just load it up */
-       RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      kernel_fp_unavailable_exception
 0:     trap
@@ -1789,7 +1760,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
        b       fast_interrupt_return
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 2:     /* User process was in a transaction */
-       RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      fp_unavailable_tm
        b       interrupt_return
@@ -1832,7 +1802,6 @@ EXC_VIRT_END(decrementer, 0x4900, 0x80)
 EXC_COMMON_BEGIN(decrementer_common)
        GEN_COMMON decrementer
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      timer_interrupt
        b       interrupt_return
@@ -1854,7 +1823,6 @@ INT_DEFINE_BEGIN(hdecrementer)
        IVEC=0x980
        IHSRR=1
        ISTACK=0
-       IRECONCILE=0
        IKVM_REAL=1
        IKVM_VIRT=1
 INT_DEFINE_END(hdecrementer)
@@ -1919,12 +1887,11 @@ EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
 EXC_COMMON_BEGIN(doorbell_super_common)
        GEN_COMMON doorbell_super
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_DOORBELL
        bl      doorbell_exception
 #else
-       bl      unknown_exception
+       bl      unknown_async_exception
 #endif
        b       interrupt_return
 
@@ -2001,12 +1968,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
        HMT_MEDIUM
 
        .if ! \virt
-       __LOAD_HANDLER(r10, system_call_common)
-       mtspr   SPRN_SRR0,r10
-       ld      r10,PACAKMSR(r13)
-       mtspr   SPRN_SRR1,r10
-       RFI_TO_KERNEL
-       b       .       /* prevent speculative execution */
+       __LOAD_HANDLER(r10, system_call_common_real)
+       mtctr   r10
+       bctr
        .else
        li      r10,MSR_RI
        mtmsrd  r10,1                   /* Set RI (EE=0) */
@@ -2137,9 +2101,7 @@ EXC_COMMON_BEGIN(h_data_storage_common)
        GEN_COMMON h_data_storage
        addi    r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-       ld      r4,_DAR(r1)
-       li      r5,SIGSEGV
-       bl      bad_page_fault
+       bl      do_bad_page_fault_segv
 MMU_FTR_SECTION_ELSE
        bl      unknown_exception
 ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
@@ -2230,7 +2192,6 @@ INT_DEFINE_BEGIN(hmi_exception_early)
        IHSRR=1
        IREALMODE_COMMON=1
        ISTACK=0
-       IRECONCILE=0
        IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */
        IKVM_REAL=1
 INT_DEFINE_END(hmi_exception_early)
@@ -2277,7 +2238,6 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 EXC_COMMON_BEGIN(hmi_exception_common)
        GEN_COMMON hmi_exception
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      handle_hmi_exception
        b       interrupt_return
@@ -2307,12 +2267,11 @@ EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
 EXC_COMMON_BEGIN(h_doorbell_common)
        GEN_COMMON h_doorbell
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_DOORBELL
        bl      doorbell_exception
 #else
-       bl      unknown_exception
+       bl      unknown_async_exception
 #endif
        b       interrupt_return
 
@@ -2341,7 +2300,6 @@ EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
 EXC_COMMON_BEGIN(h_virt_irq_common)
        GEN_COMMON h_virt_irq
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      do_IRQ
        b       interrupt_return
@@ -2388,7 +2346,6 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
 EXC_COMMON_BEGIN(performance_monitor_common)
        GEN_COMMON performance_monitor
        FINISH_NAP
-       RUNLATCH_ON
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      performance_monitor_exception
        b       interrupt_return
@@ -2404,7 +2361,6 @@ EXC_COMMON_BEGIN(performance_monitor_common)
  */
 INT_DEFINE_BEGIN(altivec_unavailable)
        IVEC=0xf20
-       IRECONCILE=0
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        IKVM_REAL=1
 #endif
@@ -2434,7 +2390,6 @@ BEGIN_FTR_SECTION
        b       fast_interrupt_return
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 2:     /* User process was in a transaction */
-       RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      altivec_unavailable_tm
        b       interrupt_return
@@ -2442,7 +2397,6 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
-       RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      altivec_unavailable_exception
        b       interrupt_return
@@ -2458,7 +2412,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  */
 INT_DEFINE_BEGIN(vsx_unavailable)
        IVEC=0xf40
-       IRECONCILE=0
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        IKVM_REAL=1
 #endif
@@ -2487,7 +2440,6 @@ BEGIN_FTR_SECTION
        b       load_up_vsx
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 2:     /* User process was in a transaction */
-       RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      vsx_unavailable_tm
        b       interrupt_return
@@ -2495,7 +2447,6 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
-       RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      vsx_unavailable_exception
        b       interrupt_return
@@ -2830,7 +2781,6 @@ EXC_VIRT_NONE(0x5800, 0x100)
 INT_DEFINE_BEGIN(soft_nmi)
        IVEC=0x900
        ISTACK=0
-       IRECONCILE=0    /* Soft-NMI may fire under local_irq_disable */
 INT_DEFINE_END(soft_nmi)
 
 /*
@@ -2849,17 +2799,6 @@ EXC_COMMON_BEGIN(soft_nmi_common)
        subi    r1,r1,INT_FRAME_SIZE
        __GEN_COMMON_BODY soft_nmi
 
-       /*
-        * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
-        * system_reset_common)
-        */
-       li      r10,IRQS_ALL_DISABLED
-       stb     r10,PACAIRQSOFTMASK(r13)
-       lbz     r10,PACAIRQHAPPENED(r13)
-       std     r10,RESULT(r1)
-       ori     r10,r10,PACA_IRQ_HARD_DIS
-       stb     r10,PACAIRQHAPPENED(r13)
-
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      soft_nmi_interrupt
 
@@ -2867,14 +2806,6 @@ EXC_COMMON_BEGIN(soft_nmi_common)
        li      r9,0
        mtmsrd  r9,1
 
-       /*
-        * Restore soft mask settings.
-        */
-       ld      r10,RESULT(r1)
-       stb     r10,PACAIRQHAPPENED(r13)
-       ld      r10,SOFTE(r1)
-       stb     r10,PACAIRQSOFTMASK(r13)
-
        kuap_kernel_restore r9, r10
        EXCEPTION_RESTORE_REGS hsrr=0
        RFI_TO_KERNEL
@@ -3148,9 +3079,6 @@ kvmppc_skip_Hinterrupt:
         * come here.
         */
 
-EXC_COMMON_BEGIN(ppc64_runlatch_on_trampoline)
-       b       __ppc64_runlatch_on
-
 USE_FIXED_SECTION(virt_trampolines)
        /*
         * All code below __end_interrupts is treated as soft-masked. If
@@ -3221,99 +3149,3 @@ disable_machine_check:
        RFI_TO_KERNEL
 1:     mtlr    r0
        blr
-
-/*
- * Hash table stuff
- */
-       .balign IFETCH_ALIGN_BYTES
-do_hash_page:
-#ifdef CONFIG_PPC_BOOK3S_64
-       lis     r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h
-       ori     r0,r0,DSISR_BAD_FAULT_64S@l
-       and.    r0,r5,r0                /* weird error? */
-       bne-    handle_page_fault       /* if not, try to insert a HPTE */
-
-       /*
-        * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
-        * don't call hash_page, just fail the fault. This is required to
-        * prevent re-entrancy problems in the hash code, namely perf
-        * interrupts hitting while something holds H_PAGE_BUSY, and taking a
-        * hash fault. See the comment in hash_preload().
-        */
-       ld      r11, PACA_THREAD_INFO(r13)
-       lwz     r0,TI_PREEMPT(r11)
-       andis.  r0,r0,NMI_MASK@h
-       bne     77f
-
-       /*
-        * r3 contains the trap number
-        * r4 contains the faulting address
-        * r5 contains dsisr
-        * r6 msr
-        *
-        * at return r3 = 0 for success, 1 for page fault, negative for error
-        */
-       bl      __hash_page             /* build HPTE if possible */
-        cmpdi  r3,0                    /* see if __hash_page succeeded */
-
-       /* Success */
-       beq     interrupt_return        /* Return from exception on success */
-
-       /* Error */
-       blt-    13f
-
-       /* Reload DAR/DSISR into r4/r5 for the DABR check below */
-       ld      r4,_DAR(r1)
-       ld      r5,_DSISR(r1)
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
-/* Here we have a page fault that hash_page can't handle. */
-handle_page_fault:
-11:    andis.  r0,r5,DSISR_DABRMATCH@h
-       bne-    handle_dabr_fault
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      do_page_fault
-       cmpdi   r3,0
-       beq+    interrupt_return
-       mr      r5,r3
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       ld      r4,_DAR(r1)
-       bl      __bad_page_fault
-       b       interrupt_return
-
-/* We have a data breakpoint exception - handle it */
-handle_dabr_fault:
-       ld      r4,_DAR(r1)
-       ld      r5,_DSISR(r1)
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      do_break
-       /*
-        * do_break() may have changed the NV GPRS while handling a breakpoint.
-        * If so, we need to restore them with their updated values.
-        */
-       REST_NVGPRS(r1)
-       b       interrupt_return
-
-
-#ifdef CONFIG_PPC_BOOK3S_64
-/* We have a page fault that hash_page could handle but HV refused
- * the PTE insertion
- */
-13:    mr      r5,r3
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       ld      r4,_DAR(r1)
-       bl      low_hash_fault
-       b       interrupt_return
-#endif
-
-/*
- * We come here as a result of a DSI at a point where we don't want
- * to call hash_page, such as when we are accessing memory (possibly
- * user memory) inside a PMU interrupt that occurred while interrupts
- * were soft-disabled.  We want to invoke the exception handler for
- * the access, or panic if there isn't a handler.
- */
-77:    addi    r3,r1,STACK_FRAME_OVERHEAD
-       li      r5,SIGSEGV
-       bl      bad_page_fault
-       b       interrupt_return
index a2f72c9..5d4706c 100644 (file)
@@ -47,7 +47,7 @@
        lwz     r1,TASK_STACK-THREAD(r1)
        addi    r1, r1, THREAD_SIZE - INT_FRAME_SIZE
 1:
-       mtcrf   0x7f, r1
+       mtcrf   0x3f, r1
        bt      32 - THREAD_ALIGN_SHIFT, stack_overflow
 #else
        subi    r11, r1, INT_FRAME_SIZE         /* use r1 if kernel */
 .endm
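
The mtcrf change above narrows which CR fields the scratch load from r1 can
clobber: the FXM mask selects CR fields MSB-first, so 0x3f spares CR1 as
well as CR0 (presumably because CR1 holds a live result across the prolog).
An illustrative decode in plain C:

#include <stdio.h>

static void print_fields(unsigned int fxm)
{
        printf("FXM %#04x ->", fxm);
        for (int i = 0; i < 8; i++)
                if (fxm & (0x80 >> i))
                        printf(" CR%d", i);
        printf("\n");
}

int main(void)
{
        print_fields(0x7f);     /* CR1 CR2 ... CR7 */
        print_fields(0x3f);     /* CR2 CR3 ... CR7 */
        return 0;
}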
 
 .macro SYSCALL_ENTRY trapno
-       mfspr   r12,SPRN_SPRG_THREAD
        mfspr   r9, SPRN_SRR1
-#ifdef CONFIG_VMAP_STACK
-       mfspr   r11, SPRN_SRR0
-       mtctr   r11
-       andi.   r11, r9, MSR_PR
+       mfspr   r10, SPRN_SRR0
+       LOAD_REG_IMMEDIATE(r11, MSR_KERNEL)             /* can take exceptions */
+       lis     r12, 1f@h
+       ori     r12, r12, 1f@l
+       mtspr   SPRN_SRR1, r11
+       mtspr   SPRN_SRR0, r12
+       mfspr   r12,SPRN_SPRG_THREAD
        mr      r11, r1
        lwz     r1,TASK_STACK-THREAD(r12)
-       beq-    99f
-       addi    r1, r1, THREAD_SIZE - INT_FRAME_SIZE
-       li      r10, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */
-       mtmsr   r10
-       isync
        tovirt(r12, r12)
+       addi    r1, r1, THREAD_SIZE - INT_FRAME_SIZE
+       rfi
+1:
        stw     r11,GPR1(r1)
        stw     r11,0(r1)
        mr      r11, r1
-#else
-       andi.   r11, r9, MSR_PR
-       lwz     r11,TASK_STACK-THREAD(r12)
-       beq-    99f
-       addi    r11, r11, THREAD_SIZE - INT_FRAME_SIZE
-       tophys(r11, r11)
-       stw     r1,GPR1(r11)
-       stw     r1,0(r11)
-       tovirt(r1, r11)         /* set new kernel sp */
-#endif
+       stw     r10,_NIP(r11)
        mflr    r10
        stw     r10, _LINK(r11)
-#ifdef CONFIG_VMAP_STACK
-       mfctr   r10
-#else
-       mfspr   r10,SPRN_SRR0
-#endif
-       stw     r10,_NIP(r11)
        mfcr    r10
        rlwinm  r10,r10,0,4,2   /* Clear SO bit in CR */
        stw     r10,_CCR(r11)           /* save registers */
 #ifdef CONFIG_40x
        rlwinm  r9,r9,0,14,12           /* clear MSR_WE (necessary?) */
-#else
-#ifdef CONFIG_VMAP_STACK
-       LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~MSR_IR) /* can take exceptions */
-#else
-       LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */
-#endif
-       mtmsr   r10                     /* (except for mach check in rtas) */
 #endif
        lis     r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */
        stw     r2,GPR2(r11)
        addi    r10,r10,STACK_FRAME_REGS_MARKER@l
        stw     r9,_MSR(r11)
-       li      r2, \trapno + 1
+       li      r2, \trapno
        stw     r10,8(r11)
        stw     r2,_TRAP(r11)
        SAVE_GPR(0, r11)
        SAVE_4GPRS(3, r11)
        SAVE_2GPRS(7, r11)
-       addi    r11,r1,STACK_FRAME_OVERHEAD
        addi    r2,r12,-THREAD
-       stw     r11,PT_REGS(r12)
-#if defined(CONFIG_40x)
-       /* Check to see if the dbcr0 register is set up to debug.  Use the
-          internal debug mode bit to do this. */
-       lwz     r12,THREAD_DBCR0(r12)
-       andis.  r12,r12,DBCR0_IDM@h
-#endif
-       ACCOUNT_CPU_USER_ENTRY(r2, r11, r12)
-#if defined(CONFIG_40x)
-       beq+    3f
-       /* From user and task is ptraced - load up global dbcr0 */
-       li      r12,-1                  /* clear all pending debug events */
-       mtspr   SPRN_DBSR,r12
-       lis     r11,global_dbcr0@ha
-       tophys(r11,r11)
-       addi    r11,r11,global_dbcr0@l
-       lwz     r12,0(r11)
-       mtspr   SPRN_DBCR0,r12
-       lwz     r12,4(r11)
-       addi    r12,r12,-1
-       stw     r12,4(r11)
-#endif
-
-3:
-       tovirt_novmstack r2, r2         /* set r2 to current */
-       lis     r11, transfer_to_syscall@h
-       ori     r11, r11, transfer_to_syscall@l
-#ifdef CONFIG_TRACE_IRQFLAGS
-       /*
-        * If MSR is changing we need to keep interrupts disabled at this point
-        * otherwise we might risk taking an interrupt before we tell lockdep
-        * they are enabled.
-        */
-       LOAD_REG_IMMEDIATE(r10, MSR_KERNEL)
-       rlwimi  r10, r9, 0, MSR_EE
-#else
-       LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
-#endif
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-       mtspr   SPRN_NRI, r0
-#endif
-       mtspr   SPRN_SRR1,r10
-       mtspr   SPRN_SRR0,r11
-       rfi                             /* jump to handler, enable MMU */
-#ifdef CONFIG_40x
-       b .     /* Prevent prefetch past rfi */
-#endif
-99:    b       ret_from_kernel_syscall
+       b       transfer_to_syscall             /* jump to handler */
 .endm
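
One detail worth unpacking from the entry path above: "rlwinm r10,r10,0,4,2"
builds a wrap-around mask covering bits 4..31 and 0..2, i.e. everything
except bit 3 of the 32-bit CR image, which is CR0.SO, the bit the syscall
ABI uses to flag errors. A runnable check of that mask arithmetic (plain C,
IBM bit numbering with bit 0 as the MSB):

#include <stdio.h>

static unsigned int rlwinm_mask(int mb, int me)
{
        unsigned int m = 0;

        for (int b = mb; ; b = (b + 1) & 31) {
                m |= 0x80000000u >> b;
                if (b == me)
                        break;
        }
        return m;
}

int main(void)
{
        unsigned int mask = rlwinm_mask(4, 2);

        /* ~mask = 0x10000000, exactly the SO bit of CR0 */
        printf("mask=%#010x clears=%#010x\n", mask, ~mask);
        return 0;
}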
 
 .macro save_dar_dsisr_on_stack reg1, reg2, sp
index a1ae006..24724a7 100644 (file)
@@ -179,9 +179,9 @@ _ENTRY(saved_ksp_limit)
  */
        START_EXCEPTION(0x0300, DataStorage)
        EXCEPTION_PROLOG
-       mfspr   r5, SPRN_ESR            /* Grab the ESR, save it, pass arg3 */
+       mfspr   r5, SPRN_ESR            /* Grab the ESR, save it */
        stw     r5, _ESR(r11)
-       mfspr   r4, SPRN_DEAR           /* Grab the DEAR, save it, pass arg2 */
+       mfspr   r4, SPRN_DEAR           /* Grab the DEAR, save it */
        stw     r4, _DEAR(r11)
        EXC_XFER_LITE(0x300, handle_page_fault)
 
@@ -191,9 +191,9 @@ _ENTRY(saved_ksp_limit)
  */
        START_EXCEPTION(0x0400, InstructionAccess)
        EXCEPTION_PROLOG
-       mr      r4,r12                  /* Pass SRR0 as arg2 */
-       stw     r4, _DEAR(r11)
-       li      r5,0                    /* Pass zero as arg3 */
+       li      r5,0
+       stw     r5, _ESR(r11)           /* Zero ESR */
+       stw     r12, _DEAR(r11)         /* SRR0 as DEAR */
        EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* 0x0500 - External Interrupt Exception */
@@ -476,6 +476,7 @@ _ENTRY(saved_ksp_limit)
 
        /* continue normal handling for a critical exception... */
 2:     mfspr   r4,SPRN_DBSR
+       stw     r4,_ESR(r11)            /* DebugException takes DBSR in _ESR */
        addi    r3,r1,STACK_FRAME_OVERHEAD
        EXC_XFER_TEMPLATE(DebugException, 0x2002, \
                (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
index 8e36718..813fa30 100644 (file)
@@ -376,7 +376,7 @@ interrupt_base:
        /* Load the next available TLB index */
        lwz     r13,tlb_44x_index@l(r10)
 
-       bne     2f                      /* Bail if permission mismach */
+       bne     2f                      /* Bail if permission mismatch */
 
        /* Increment, rollover, and store TLB index */
        addi    r13,r13,1
@@ -471,7 +471,7 @@ interrupt_base:
        /* Load the next available TLB index */
        lwz     r13,tlb_44x_index@l(r10)
 
-       bne     2f                      /* Bail if permission mismach */
+       bne     2f                      /* Bail if permission mismatch */
 
        /* Increment, rollover, and store TLB index */
        addi    r13,r13,1
index 52702f3..46dff3f 100644 (file)
@@ -165,7 +165,7 @@ SystemCall:
 /* On the MPC8xx, this is a software emulation interrupt.  It occurs
  * for all unimplemented and illegal instructions.
  */
-       EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD)
+       EXCEPTION(0x1000, SoftEmu, emulation_assist_interrupt, EXC_XFER_STD)
 
        . = 0x1100
 /*
@@ -312,14 +312,14 @@ DataStoreTLBMiss:
        . = 0x1300
 InstructionTLBError:
        EXCEPTION_PROLOG
-       mr      r4,r12
        andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
        andis.  r10,r9,SRR1_ISI_NOPT@h
        beq+    .Litlbie
-       tlbie   r4
+       tlbie   r12
        /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
 .Litlbie:
-       stw     r4, _DAR(r11)
+       stw     r12, _DAR(r11)
+       stw     r5, _DSISR(r11)
        EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* This is the data TLB error on the MPC8xx.  This could be due to
@@ -364,10 +364,9 @@ do_databreakpoint:
        addi    r3,r1,STACK_FRAME_OVERHEAD
        mfspr   r4,SPRN_BAR
        stw     r4,_DAR(r11)
-#ifdef CONFIG_VMAP_STACK
-       lwz     r5,_DSISR(r11)
-#else
+#ifndef CONFIG_VMAP_STACK
        mfspr   r5,SPRN_DSISR
+       stw     r5,_DSISR(r11)
 #endif
        EXC_XFER_STD(0x1c00, do_break)
 
index 858fbc8..727fdab 100644 (file)
@@ -238,8 +238,8 @@ __secondary_hold_acknowledge:
 
 /* System reset */
 /* core99 pmac starts the secondary here by changing the vector, and
-   putting it back to what it was (unknown_exception) when done.  */
-       EXCEPTION(0x100, Reset, unknown_exception, EXC_XFER_STD)
+   putting it back to what it was (unknown_async_exception) when done.  */
+       EXCEPTION(0x100, Reset, unknown_async_exception, EXC_XFER_STD)
 
 /* Machine check */
 /*
@@ -278,12 +278,6 @@ MachineCheck:
 7:     EXCEPTION_PROLOG_2
        addi    r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_CHRP
-#ifdef CONFIG_VMAP_STACK
-       mfspr   r4, SPRN_SPRG_THREAD
-       tovirt(r4, r4)
-       lwz     r4, RTAS_SP(r4)
-       cmpwi   cr1, r4, 0
-#endif
        beq     cr1, machine_check_tramp
        twi     31, 0, 0
 #else
@@ -295,6 +289,7 @@ MachineCheck:
        DO_KVM  0x300
 DataAccess:
 #ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_PPC_BOOK3S_604
 BEGIN_MMU_FTR_SECTION
        mtspr   SPRN_SPRG_SCRATCH2,r10
        mfspr   r10, SPRN_SPRG_THREAD
@@ -311,12 +306,14 @@ BEGIN_MMU_FTR_SECTION
 MMU_FTR_SECTION_ELSE
        b       1f
 ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
 1:     EXCEPTION_PROLOG_0 handle_dar_dsisr=1
        EXCEPTION_PROLOG_1
        b       handle_page_fault_tramp_1
 #else  /* CONFIG_VMAP_STACK */
        EXCEPTION_PROLOG handle_dar_dsisr=1
        get_and_save_dar_dsisr_on_stack r4, r5, r11
+#ifdef CONFIG_PPC_BOOK3S_604
 BEGIN_MMU_FTR_SECTION
        andis.  r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h
        bne     handle_page_fault_tramp_2       /* if not, try to put a PTE */
@@ -324,8 +321,11 @@ BEGIN_MMU_FTR_SECTION
        bl      hash_page
        b       handle_page_fault_tramp_1
 MMU_FTR_SECTION_ELSE
+#endif
        b       handle_page_fault_tramp_2
+#ifdef CONFIG_PPC_BOOK3S_604
 ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
 #endif /* CONFIG_VMAP_STACK */
 
 /* Instruction access exception. */
@@ -341,12 +341,14 @@ InstructionAccess:
        mfspr   r11, SPRN_SRR1          /* check whether user or kernel */
        stw     r11, SRR1(r10)
        mfcr    r10
+#ifdef CONFIG_PPC_BOOK3S_604
 BEGIN_MMU_FTR_SECTION
        andis.  r11, r11, SRR1_ISI_NOPT@h       /* no pte found? */
        bne     hash_page_isi
 .Lhash_page_isi_cont:
        mfspr   r11, SPRN_SRR1          /* check whether user or kernel */
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
        andi.   r11, r11, MSR_PR
 
        EXCEPTION_PROLOG_1
@@ -357,13 +359,15 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
        beq     1f                      /* if so, try to put a PTE */
        li      r3,0                    /* into the hash table */
        mr      r4,r12                  /* SRR0 is fault address */
+#ifdef CONFIG_PPC_BOOK3S_604
 BEGIN_MMU_FTR_SECTION
        bl      hash_page
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
 #endif /* CONFIG_VMAP_STACK */
-1:     mr      r4,r12
        andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
-       stw     r4, _DAR(r11)
+       stw     r5, _DSISR(r11)
+       stw     r12, _DAR(r11)
        EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* External interrupt */
@@ -640,7 +644,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
 #endif
 
 #ifndef CONFIG_TAU_INT
-#define TAUException   unknown_exception
+#define TAUException   unknown_async_exception
 #endif
 
        EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_STD)
@@ -685,13 +689,16 @@ handle_page_fault_tramp_1:
 #ifdef CONFIG_VMAP_STACK
        EXCEPTION_PROLOG_2 handle_dar_dsisr=1
 #endif
-       lwz     r4, _DAR(r11)
        lwz     r5, _DSISR(r11)
        /* fall through */
 handle_page_fault_tramp_2:
+       andis.  r0, r5, DSISR_DABRMATCH@h
+       bne-    1f
        EXC_XFER_LITE(0x300, handle_page_fault)
+1:     EXC_XFER_STD(0x300, do_break)
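
The same DSISR_DABRMATCH split now appears on both 64-bit (in
data_access_common above) and here on 32-bit: "andis." tests the high
halfword of DSISR against DSISR_DABRMATCH@h, routing data breakpoint hits to
do_break and everything else to the page fault path. A runnable sketch of
that routing (plain C; 0x00400000 is the kernel's DSISR_DABRMATCH value):

#include <stdio.h>

#define DSISR_DABRMATCH 0x00400000u

static const char *route(unsigned int dsisr)
{
        /* andis. r0,r5,DSISR_DABRMATCH@h ; bne- 1f */
        return (dsisr & DSISR_DABRMATCH) ? "do_break" : "handle_page_fault";
}

int main(void)
{
        printf("%#010x -> %s\n", 0x40000000u, route(0x40000000u));
        printf("%#010x -> %s\n", DSISR_DABRMATCH, route(DSISR_DABRMATCH));
        return 0;
}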
 
 #ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_PPC_BOOK3S_604
 .macro save_regs_thread                thread
        stw     r0, THR0(\thread)
        stw     r3, THR3(\thread)
@@ -763,6 +770,7 @@ fast_hash_page_return:
        mfspr   r11, SPRN_SPRG_SCRATCH1
        mfspr   r10, SPRN_SPRG_SCRATCH0
        rfi
+#endif /* CONFIG_PPC_BOOK3S_604 */
 
 stack_overflow:
        vmap_stack_overflow_exception
index 74e230c..4785779 100644 (file)
@@ -106,10 +106,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 #endif
        mfspr   r9, SPRN_SRR1
        BOOKE_CLEAR_BTB(r11)
-       andi.   r11, r9, MSR_PR
        lwz     r11, TASK_STACK - THREAD(r10)
        rlwinm  r12,r12,0,4,2   /* Clear SO bit in CR */
-       beq-    99f
        ALLOC_STACK_FRAME(r11, THREAD_SIZE - INT_FRAME_SIZE)
        stw     r12, _CCR(r11)          /* save various registers */
        mflr    r12
@@ -124,60 +122,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
        stw     r2,GPR2(r11)
        addi    r12, r12, STACK_FRAME_REGS_MARKER@l
        stw     r9,_MSR(r11)
-       li      r2, \trapno + 1
+       li      r2, \trapno
        stw     r12, 8(r11)
        stw     r2,_TRAP(r11)
        SAVE_GPR(0, r11)
        SAVE_4GPRS(3, r11)
        SAVE_2GPRS(7, r11)
 
-       addi    r11,r1,STACK_FRAME_OVERHEAD
        addi    r2,r10,-THREAD
-       stw     r11,PT_REGS(r10)
-       /* Check to see if the dbcr0 register is set up to debug.  Use the
-          internal debug mode bit to do this. */
-       lwz     r12,THREAD_DBCR0(r10)
-       andis.  r12,r12,DBCR0_IDM@h
-       ACCOUNT_CPU_USER_ENTRY(r2, r11, r12)
-       beq+    3f
-       /* From user and task is ptraced - load up global dbcr0 */
-       li      r12,-1                  /* clear all pending debug events */
-       mtspr   SPRN_DBSR,r12
-       lis     r11,global_dbcr0@ha
-       tophys(r11,r11)
-       addi    r11,r11,global_dbcr0@l
-#ifdef CONFIG_SMP
-       lwz     r10, TASK_CPU(r2)
-       slwi    r10, r10, 3
-       add     r11, r11, r10
-#endif
-       lwz     r12,0(r11)
-       mtspr   SPRN_DBCR0,r12
-       lwz     r12,4(r11)
-       addi    r12,r12,-1
-       stw     r12,4(r11)
-
-3:
-       tovirt(r2, r2)                  /* set r2 to current */
-       lis     r11, transfer_to_syscall@h
-       ori     r11, r11, transfer_to_syscall@l
-#ifdef CONFIG_TRACE_IRQFLAGS
-       /*
-        * If MSR is changing we need to keep interrupts disabled at this point
-        * otherwise we might risk taking an interrupt before we tell lockdep
-        * they are enabled.
-        */
-       lis     r10, MSR_KERNEL@h
-       ori     r10, r10, MSR_KERNEL@l
-       rlwimi  r10, r9, 0, MSR_EE
-#else
-       lis     r10, (MSR_KERNEL | MSR_EE)@h
-       ori     r10, r10, (MSR_KERNEL | MSR_EE)@l
-#endif
-       mtspr   SPRN_SRR1,r10
-       mtspr   SPRN_SRR0,r11
-       rfi                             /* jump to handler, enable MMU */
-99:    b       ret_from_kernel_syscall
+       b       transfer_to_syscall     /* jump to handler */
 .endm
 
 /* To handle the additional exception priority levels on 40x and Book-E
@@ -406,6 +359,7 @@ label:
                                                                              \
        /* continue normal handling for a debug exception... */               \
 2:     mfspr   r4,SPRN_DBSR;                                                 \
+       stw     r4,_ESR(r11);           /* DebugException takes DBSR in _ESR */\
        addi    r3,r1,STACK_FRAME_OVERHEAD;                                   \
        EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), debug_transfer_to_handler, ret_from_debug_exc)
 
@@ -459,6 +413,7 @@ label:
                                                                              \
        /* continue normal handling for a critical exception... */            \
 2:     mfspr   r4,SPRN_DBSR;                                                 \
+       stw     r4,_ESR(r11);           /* DebugException takes DBSR in _ESR */\
        addi    r3,r1,STACK_FRAME_OVERHEAD;                                   \
        EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), crit_transfer_to_handler, ret_from_crit_exc)
 
@@ -476,9 +431,7 @@ label:
        NORMAL_EXCEPTION_PROLOG(INST_STORAGE);                \
        mfspr   r5,SPRN_ESR;            /* Grab the ESR and save it */        \
        stw     r5,_ESR(r11);                                                 \
-       mr      r4,r12;                 /* Pass SRR0 as arg2 */               \
-       stw     r4, _DEAR(r11);                                               \
-       li      r5,0;                   /* Pass zero as arg3 */               \
+       stw     r12, _DEAR(r11);        /* Pass SRR0 as arg2 */               \
        EXC_XFER_LITE(0x0400, handle_page_fault)
 
 #define ALIGNMENT_EXCEPTION                                                  \
index fdd4d27..3f4a40c 100644 (file)
@@ -364,12 +364,12 @@ interrupt_base:
        /* Data Storage Interrupt */
        START_EXCEPTION(DataStorage)
        NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
-       mfspr   r5,SPRN_ESR             /* Grab the ESR, save it, pass arg3 */
+       mfspr   r5,SPRN_ESR             /* Grab the ESR, save it */
        stw     r5,_ESR(r11)
-       mfspr   r4,SPRN_DEAR            /* Grab the DEAR, save it, pass arg2 */
+       mfspr   r4,SPRN_DEAR            /* Grab the DEAR, save it */
+       stw     r4, _DEAR(r11)
        andis.  r10,r5,(ESR_ILK|ESR_DLK)@h
        bne     1f
-       stw     r4, _DEAR(r11)
        EXC_XFER_LITE(0x0300, handle_page_fault)
 1:
        addi    r3,r1,STACK_FRAME_OVERHEAD
index 22f249b..f9e6d83 100644 (file)
@@ -52,28 +52,32 @@ _GLOBAL(isa300_idle_stop_mayloss)
        std     r1,PACAR1(r13)
        mflr    r4
        mfcr    r5
-       /* use stack red zone rather than a new frame for saving regs */
-       std     r2,-8*0(r1)
-       std     r14,-8*1(r1)
-       std     r15,-8*2(r1)
-       std     r16,-8*3(r1)
-       std     r17,-8*4(r1)
-       std     r18,-8*5(r1)
-       std     r19,-8*6(r1)
-       std     r20,-8*7(r1)
-       std     r21,-8*8(r1)
-       std     r22,-8*9(r1)
-       std     r23,-8*10(r1)
-       std     r24,-8*11(r1)
-       std     r25,-8*12(r1)
-       std     r26,-8*13(r1)
-       std     r27,-8*14(r1)
-       std     r28,-8*15(r1)
-       std     r29,-8*16(r1)
-       std     r30,-8*17(r1)
-       std     r31,-8*18(r1)
-       std     r4,-8*19(r1)
-       std     r5,-8*20(r1)
+       /*
+        * Use the stack red zone rather than a new frame for saving regs since
+        * in the case of no GPR loss the wakeup code branches directly back to
+        * the caller without deallocating the stack frame first.
+        */
+       std     r2,-8*1(r1)
+       std     r14,-8*2(r1)
+       std     r15,-8*3(r1)
+       std     r16,-8*4(r1)
+       std     r17,-8*5(r1)
+       std     r18,-8*6(r1)
+       std     r19,-8*7(r1)
+       std     r20,-8*8(r1)
+       std     r21,-8*9(r1)
+       std     r22,-8*10(r1)
+       std     r23,-8*11(r1)
+       std     r24,-8*12(r1)
+       std     r25,-8*13(r1)
+       std     r26,-8*14(r1)
+       std     r27,-8*15(r1)
+       std     r28,-8*16(r1)
+       std     r29,-8*17(r1)
+       std     r30,-8*18(r1)
+       std     r31,-8*19(r1)
+       std     r4,-8*20(r1)
+       std     r5,-8*21(r1)
        /* 168 bytes */
        PPC_STOP
        b       .       /* catch bugs */
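
The renumbered slots above start at -8*1 presumably because 0(r1) holds the
stack back-chain word, which must stay intact while the frame is live. A
quick arithmetic check (illustrative) that the 21 saved doublewords still
match the "168 bytes" comment and fit the 288-byte ELFv2 red zone:

#include <stdio.h>

int main(void)
{
        int nregs = 1 /* r2 */ + 18 /* r14-r31 */ + 2 /* LR, CR */;

        printf("%d regs * 8 = %d bytes (red zone: 288)\n",
               nregs, nregs * 8);
        return 0;
}
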
@@ -89,8 +93,8 @@ _GLOBAL(isa300_idle_stop_mayloss)
  */
 _GLOBAL(idle_return_gpr_loss)
        ld      r1,PACAR1(r13)
-       ld      r4,-8*19(r1)
-       ld      r5,-8*20(r1)
+       ld      r4,-8*20(r1)
+       ld      r5,-8*21(r1)
        mtlr    r4
        mtcr    r5
        /*
@@ -98,25 +102,25 @@ _GLOBAL(idle_return_gpr_loss)
         * from PACATOC. This could be avoided for that less common case
         * if KVM saved its r2.
         */
-       ld      r2,-8*0(r1)
-       ld      r14,-8*1(r1)
-       ld      r15,-8*2(r1)
-       ld      r16,-8*3(r1)
-       ld      r17,-8*4(r1)
-       ld      r18,-8*5(r1)
-       ld      r19,-8*6(r1)
-       ld      r20,-8*7(r1)
-       ld      r21,-8*8(r1)
-       ld      r22,-8*9(r1)
-       ld      r23,-8*10(r1)
-       ld      r24,-8*11(r1)
-       ld      r25,-8*12(r1)
-       ld      r26,-8*13(r1)
-       ld      r27,-8*14(r1)
-       ld      r28,-8*15(r1)
-       ld      r29,-8*16(r1)
-       ld      r30,-8*17(r1)
-       ld      r31,-8*18(r1)
+       ld      r2,-8*1(r1)
+       ld      r14,-8*2(r1)
+       ld      r15,-8*3(r1)
+       ld      r16,-8*4(r1)
+       ld      r17,-8*5(r1)
+       ld      r18,-8*6(r1)
+       ld      r19,-8*7(r1)
+       ld      r20,-8*8(r1)
+       ld      r21,-8*9(r1)
+       ld      r22,-8*10(r1)
+       ld      r23,-8*11(r1)
+       ld      r24,-8*12(r1)
+       ld      r25,-8*13(r1)
+       ld      r26,-8*14(r1)
+       ld      r27,-8*15(r1)
+       ld      r28,-8*16(r1)
+       ld      r29,-8*17(r1)
+       ld      r30,-8*18(r1)
+       ld      r31,-8*19(r1)
        blr
 
 /*
@@ -154,28 +158,32 @@ _GLOBAL(isa206_idle_insn_mayloss)
        std     r1,PACAR1(r13)
        mflr    r4
        mfcr    r5
-       /* use stack red zone rather than a new frame for saving regs */
-       std     r2,-8*0(r1)
-       std     r14,-8*1(r1)
-       std     r15,-8*2(r1)
-       std     r16,-8*3(r1)
-       std     r17,-8*4(r1)
-       std     r18,-8*5(r1)
-       std     r19,-8*6(r1)
-       std     r20,-8*7(r1)
-       std     r21,-8*8(r1)
-       std     r22,-8*9(r1)
-       std     r23,-8*10(r1)
-       std     r24,-8*11(r1)
-       std     r25,-8*12(r1)
-       std     r26,-8*13(r1)
-       std     r27,-8*14(r1)
-       std     r28,-8*15(r1)
-       std     r29,-8*16(r1)
-       std     r30,-8*17(r1)
-       std     r31,-8*18(r1)
-       std     r4,-8*19(r1)
-       std     r5,-8*20(r1)
+       /*
+        * Use the stack red zone rather than a new frame for saving regs since
+        * in the case of no GPR loss the wakeup code branches directly back to
+        * the caller without deallocating the stack frame first.
+        */
+       std     r2,-8*1(r1)
+       std     r14,-8*2(r1)
+       std     r15,-8*3(r1)
+       std     r16,-8*4(r1)
+       std     r17,-8*5(r1)
+       std     r18,-8*6(r1)
+       std     r19,-8*7(r1)
+       std     r20,-8*8(r1)
+       std     r21,-8*9(r1)
+       std     r22,-8*10(r1)
+       std     r23,-8*11(r1)
+       std     r24,-8*12(r1)
+       std     r25,-8*13(r1)
+       std     r26,-8*14(r1)
+       std     r27,-8*15(r1)
+       std     r28,-8*16(r1)
+       std     r29,-8*17(r1)
+       std     r30,-8*18(r1)
+       std     r31,-8*19(r1)
+       std     r4,-8*20(r1)
+       std     r5,-8*21(r1)
        cmpwi   r3,PNV_THREAD_NAP
        bne     1f
        IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
new file mode 100644 (file)
index 0000000..398cd86
--- /dev/null
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/context_tracking.h>
+#include <linux/err.h>
+#include <linux/compat.h>
+
+#include <asm/asm-prototypes.h>
+#include <asm/kup.h>
+#include <asm/cputime.h>
+#include <asm/interrupt.h>
+#include <asm/hw_irq.h>
+#include <asm/kprobes.h>
+#include <asm/paca.h>
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+#include <asm/signal.h>
+#include <asm/switch_to.h>
+#include <asm/syscall.h>
+#include <asm/time.h>
+#include <asm/unistd.h>
+
+typedef long (*syscall_fn)(long, long, long, long, long, long);
+
+/* Has to run notrace because it is entered not completely "reconciled" */
+notrace long system_call_exception(long r3, long r4, long r5,
+                                  long r6, long r7, long r8,
+                                  unsigned long r0, struct pt_regs *regs)
+{
+       syscall_fn f;
+
+       regs->orig_gpr3 = r3;
+
+       if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+               BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
+
+       CT_WARN_ON(ct_state() == CONTEXT_KERNEL);
+       user_exit_irqoff();
+
+       trace_hardirqs_off(); /* finish reconciling */
+
+       if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
+               BUG_ON(!(regs->msr & MSR_RI));
+       BUG_ON(!(regs->msr & MSR_PR));
+       BUG_ON(!FULL_REGS(regs));
+       BUG_ON(arch_irq_disabled_regs(regs));
+
+#ifdef CONFIG_PPC_PKEY
+       if (mmu_has_feature(MMU_FTR_PKEY)) {
+               unsigned long amr, iamr;
+               bool flush_needed = false;
+               /*
+                * On entry from userspace the AMR/IAMR usually differ from
+                * the kernel default values, so don't bother comparing.
+                */
+               amr = mfspr(SPRN_AMR);
+               iamr = mfspr(SPRN_IAMR);
+               regs->amr  = amr;
+               regs->iamr = iamr;
+               if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
+                       mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
+                       flush_needed = true;
+               }
+               if (mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
+                       mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
+                       flush_needed = true;
+               }
+               if (flush_needed)
+                       isync();
+       } else
+#endif
+#ifdef CONFIG_PPC64
+               kuap_check_amr();
+#endif
+
+       booke_restore_dbcr0();
+
+       account_cpu_user_entry();
+
+       account_stolen_time();
+
+       /*
+        * This is not required for the syscall exit path, but makes the
+        * stack frame look nicer. If this was initialised in the first stack
+        * frame, or if the unwinder was taught the first stack frame always
+        * returns to user with IRQS_ENABLED, this store could be avoided!
+        */
+       irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
+
+       local_irq_enable();
+
+       if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
+               if (unlikely(trap_is_unsupported_scv(regs))) {
+                       /* Unsupported scv vector */
+                       _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+                       return regs->gpr[3];
+               }
+               /*
+                * We use the return value of do_syscall_trace_enter() as the
+                * syscall number. If the syscall was rejected for any reason,
+                * do_syscall_trace_enter() returns an invalid syscall number
+                * so that the test against NR_syscalls fails, and the value
+                * already in regs->gpr[3] is used as the return value.
+                */
+               r0 = do_syscall_trace_enter(regs);
+               if (unlikely(r0 >= NR_syscalls))
+                       return regs->gpr[3];
+               r3 = regs->gpr[3];
+               r4 = regs->gpr[4];
+               r5 = regs->gpr[5];
+               r6 = regs->gpr[6];
+               r7 = regs->gpr[7];
+               r8 = regs->gpr[8];
+
+       } else if (unlikely(r0 >= NR_syscalls)) {
+               if (unlikely(trap_is_unsupported_scv(regs))) {
+                       /* Unsupported scv vector */
+                       _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+                       return regs->gpr[3];
+               }
+               return -ENOSYS;
+       }
+
+       /* May be faster to do array_index_nospec? */
+       barrier_nospec();
+
+       if (unlikely(is_compat_task())) {
+               f = (void *)compat_sys_call_table[r0];
+
+               r3 &= 0x00000000ffffffffULL;
+               r4 &= 0x00000000ffffffffULL;
+               r5 &= 0x00000000ffffffffULL;
+               r6 &= 0x00000000ffffffffULL;
+               r7 &= 0x00000000ffffffffULL;
+               r8 &= 0x00000000ffffffffULL;
+
+       } else {
+               f = (void *)sys_call_table[r0];
+       }
+
+       return f(r3, r4, r5, r6, r7, r8);
+}
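
A minimal user-space illustration (not kernel code) of the compat path
above: a 32-bit task may leave stale high halves in 64-bit GPRs, so each
argument register is masked to its low 32 bits before the handler is
called:

#include <stdio.h>

int main(void)
{
        unsigned long r4  = 0xdeadbeef00000100UL;  /* high half is junk */
        unsigned long arg = r4 & 0x00000000ffffffffUL;

        printf("raw=%#lx compat arg=%#lx\n", r4, arg);  /* arg = 0x100 */
        return 0;
}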
+
+/*
+ * local irqs must be disabled. Returns false if the caller must re-enable
+ * them, check for new work, and try again.
+ *
+ * This should be called with local irqs disabled, but if they were previously
+ * enabled when the interrupt handler returns (indicating a process-context /
+ * synchronous interrupt) then irqs_enabled should be true.
+ */
+static notrace inline bool __prep_irq_for_enabled_exit(bool clear_ri)
+{
+       /* This must be done with RI=1 because tracing may touch vmaps */
+       trace_hardirqs_on();
+
+       /* This pattern matches prep_irq_for_idle */
+       if (clear_ri)
+               __hard_EE_RI_disable();
+       else
+               __hard_irq_disable();
+#ifdef CONFIG_PPC64
+       if (unlikely(lazy_irq_pending_nocheck())) {
+               /* Took an interrupt, may have more exit work to do. */
+               if (clear_ri)
+                       __hard_RI_enable();
+               trace_hardirqs_off();
+               local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+               return false;
+       }
+       local_paca->irq_happened = 0;
+       irq_soft_mask_set(IRQS_ENABLED);
+#endif
+       return true;
+}
+
+static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri, bool irqs_enabled)
+{
+       if (__prep_irq_for_enabled_exit(clear_ri))
+               return true;
+
+       /*
+        * Must replay pending soft-masked interrupts now. Don't just
+        * local_irq_enable(); local_irq_disable(); because if we are
+        * returning from an asynchronous interrupt here, another one
+        * might hit after irqs are enabled, and it would exit via this
+        * same path allowing another to fire, and so on unbounded.
+        *
+        * If interrupts were enabled when this interrupt exited,
+        * indicating a process context (synchronous) interrupt,
+        * local_irq_enable/disable can be used, which will enable
+        * interrupts rather than keeping them masked (unclear how
+        * much benefit this is over just replaying for all cases,
+        * because we immediately disable again, so all we're really
+        * doing is allowing hard interrupts to execute directly for
+        * a very small time, rather than being masked and replayed).
+        */
+       if (irqs_enabled) {
+               local_irq_enable();
+               local_irq_disable();
+       } else {
+               replay_soft_interrupts();
+       }
+
+       return false;
+}
+
+static notrace void booke_load_dbcr0(void)
+{
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+       unsigned long dbcr0 = current->thread.debug.dbcr0;
+
+       if (likely(!(dbcr0 & DBCR0_IDM)))
+               return;
+
+       /*
+        * Check to see if the dbcr0 register is set up to debug.
+        * Use the internal debug mode bit to do this.
+        */
+       mtmsr(mfmsr() & ~MSR_DE);
+       if (IS_ENABLED(CONFIG_PPC32)) {
+               isync();
+               global_dbcr0[smp_processor_id()] = mfspr(SPRN_DBCR0);
+       }
+       mtspr(SPRN_DBCR0, dbcr0);
+       mtspr(SPRN_DBSR, -1);
+#endif
+}
+
+/*
+ * This should be called after a syscall returns, with r3 the return value
+ * from the syscall. If this function returns non-zero, the system call
+ * exit assembly should additionally load all GPR registers and CTR and XER
+ * from the interrupt frame.
+ *
+ * The function graph tracer cannot trace the return side of this function,
+ * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
+ */
+notrace unsigned long syscall_exit_prepare(unsigned long r3,
+                                          struct pt_regs *regs,
+                                          long scv)
+{
+       unsigned long ti_flags;
+       unsigned long ret = 0;
+       bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv;
+
+       CT_WARN_ON(ct_state() == CONTEXT_USER);
+
+#ifdef CONFIG_PPC64
+       kuap_check_amr();
+#endif
+
+       regs->result = r3;
+
+       /* Check whether the syscall is issued inside a restartable sequence */
+       rseq_syscall(regs);
+
+       ti_flags = current_thread_info()->flags;
+
+       if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && is_not_scv) {
+               if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
+                       r3 = -r3;
+                       regs->ccr |= 0x10000000; /* Set SO bit in CR */
+               }
+       }
+
+       if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
+               if (ti_flags & _TIF_RESTOREALL)
+                       ret = _TIF_RESTOREALL;
+               else
+                       regs->gpr[3] = r3;
+               clear_bits(_TIF_PERSYSCALL_MASK, &current_thread_info()->flags);
+       } else {
+               regs->gpr[3] = r3;
+       }
+
+       if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
+               do_syscall_trace_leave(regs);
+               ret |= _TIF_RESTOREALL;
+       }
+
+       local_irq_disable();
+
+again:
+       ti_flags = READ_ONCE(current_thread_info()->flags);
+       while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
+               local_irq_enable();
+               if (ti_flags & _TIF_NEED_RESCHED) {
+                       schedule();
+               } else {
+                       /*
+                        * SIGPENDING must restore signal handler function
+                        * argument GPRs, and some non-volatiles (e.g., r1).
+                        * Restore all for now. This could be made lighter.
+                        */
+                       if (ti_flags & _TIF_SIGPENDING)
+                               ret |= _TIF_RESTOREALL;
+                       do_notify_resume(regs, ti_flags);
+               }
+               local_irq_disable();
+               ti_flags = READ_ONCE(current_thread_info()->flags);
+       }
+
+       if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
+               if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
+                               unlikely((ti_flags & _TIF_RESTORE_TM))) {
+                       restore_tm_state(regs);
+               } else {
+                       unsigned long mathflags = MSR_FP;
+
+                       if (cpu_has_feature(CPU_FTR_VSX))
+                               mathflags |= MSR_VEC | MSR_VSX;
+                       else if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                               mathflags |= MSR_VEC;
+
+                       /*
+                        * If userspace MSR has all available FP bits set,
+                        * then they are live and no need to restore. If not,
+                        * it means the regs were given up and restore_math
+                        * may decide to restore them (to avoid taking an FP
+                        * fault).
+                        */
+                       if ((regs->msr & mathflags) != mathflags)
+                               restore_math(regs);
+               }
+       }
+
+       user_enter_irqoff();
+
+       /* scv need not set RI=0 because SRRs are not used */
+       if (unlikely(!__prep_irq_for_enabled_exit(is_not_scv))) {
+               user_exit_irqoff();
+               local_irq_enable();
+               local_irq_disable();
+               goto again;
+       }
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       local_paca->tm_scratch = regs->msr;
+#endif
+
+       booke_load_dbcr0();
+
+       account_cpu_user_exit();
+
+#ifdef CONFIG_PPC_BOOK3S_64 /* BOOK3E and ppc32 not using this */
+       /*
+        * We do this at the end so that any context switch happens with the
+        * kernel AMR in place.
+        */
+       kuap_user_restore(regs);
+#endif
+       return ret;
+}
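
A runnable illustration (plain C, not kernel code) of the "sc" return
convention implemented above: a return value in [-MAX_ERRNO, -1] is
reported by negating r3 and setting the SO bit (0x10000000) in CR, while
the scv ABI returns negative errno directly and skips this:

#include <stdio.h>

#define MAX_ERRNO 4095

int main(void)
{
        unsigned long r3  = (unsigned long)-14;    /* handler returned -EFAULT */
        unsigned long ccr = 0;

        if (r3 >= (unsigned long)-MAX_ERRNO) {     /* top 4095 values = error */
                r3 = -r3;
                ccr |= 0x10000000;                 /* CR0.SO */
        }
        printf("user sees r3=%lu SO=%d\n", r3, !!(ccr & 0x10000000));
        return 0;
}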
+
+#ifndef CONFIG_PPC_BOOK3E_64 /* BOOK3E not yet using this */
+notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
+{
+       unsigned long ti_flags;
+       unsigned long flags;
+       unsigned long ret = 0;
+
+       if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
+               BUG_ON(!(regs->msr & MSR_RI));
+       BUG_ON(!(regs->msr & MSR_PR));
+       BUG_ON(!FULL_REGS(regs));
+       BUG_ON(arch_irq_disabled_regs(regs));
+       CT_WARN_ON(ct_state() == CONTEXT_USER);
+
+       /*
+        * We don't need to restore AMR on the way back to userspace for KUAP.
+        * AMR can only have been unlocked if we interrupted the kernel.
+        */
+#ifdef CONFIG_PPC64
+       kuap_check_amr();
+#endif
+
+       local_irq_save(flags);
+
+again:
+       ti_flags = READ_ONCE(current_thread_info()->flags);
+       while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
+               local_irq_enable(); /* returning to user: may enable */
+               if (ti_flags & _TIF_NEED_RESCHED) {
+                       schedule();
+               } else {
+                       if (ti_flags & _TIF_SIGPENDING)
+                               ret |= _TIF_RESTOREALL;
+                       do_notify_resume(regs, ti_flags);
+               }
+               local_irq_disable();
+               ti_flags = READ_ONCE(current_thread_info()->flags);
+       }
+
+       if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
+               if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
+                               unlikely((ti_flags & _TIF_RESTORE_TM))) {
+                       restore_tm_state(regs);
+               } else {
+                       unsigned long mathflags = MSR_FP;
+
+                       if (cpu_has_feature(CPU_FTR_VSX))
+                               mathflags |= MSR_VEC | MSR_VSX;
+                       else if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                               mathflags |= MSR_VEC;
+
+                       /* See above restore_math comment */
+                       if ((regs->msr & mathflags) != mathflags)
+                               restore_math(regs);
+               }
+       }
+
+       user_enter_irqoff();
+
+       if (unlikely(!__prep_irq_for_enabled_exit(true))) {
+               user_exit_irqoff();
+               local_irq_enable();
+               local_irq_disable();
+               goto again;
+       }
+
+       booke_load_dbcr0();
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       local_paca->tm_scratch = regs->msr;
+#endif
+
+       account_cpu_user_exit();
+
+       /*
+        * We do this at the end so that any context switch happens with the
+        * kernel AMR in place.
+        */
+#ifdef CONFIG_PPC64
+       kuap_user_restore(regs);
+#endif
+       return ret;
+}
+
+void unrecoverable_exception(struct pt_regs *regs);
+void preempt_schedule_irq(void);
+
+notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)
+{
+       unsigned long flags;
+       unsigned long ret = 0;
+#ifdef CONFIG_PPC64
+       unsigned long amr;
+#endif
+
+       if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
+           unlikely(!(regs->msr & MSR_RI)))
+               unrecoverable_exception(regs);
+       BUG_ON(regs->msr & MSR_PR);
+       BUG_ON(!FULL_REGS(regs));
+       /*
+        * CT_WARN_ON comes here via program_check_exception,
+        * so avoid recursion.
+        */
+       if (TRAP(regs) != 0x700)
+               CT_WARN_ON(ct_state() == CONTEXT_USER);
+
+#ifdef CONFIG_PPC64
+       amr = kuap_get_and_check_amr();
+#endif
+
+       if (unlikely(current_thread_info()->flags & _TIF_EMULATE_STACK_STORE)) {
+               clear_bits(_TIF_EMULATE_STACK_STORE, &current_thread_info()->flags);
+               ret = 1;
+       }
+
+       local_irq_save(flags);
+
+       if (!arch_irq_disabled_regs(regs)) {
+               /* Returning to a kernel context with local irqs enabled. */
+               WARN_ON_ONCE(!(regs->msr & MSR_EE));
+again:
+               if (IS_ENABLED(CONFIG_PREEMPT)) {
+                       /* Return to preemptible kernel context */
+                       if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) {
+                               if (preempt_count() == 0)
+                                       preempt_schedule_irq();
+                       }
+               }
+
+               if (unlikely(!prep_irq_for_enabled_exit(true, !irqs_disabled_flags(flags))))
+                       goto again;
+       } else {
+               /* Returning to a kernel context with local irqs disabled. */
+               __hard_EE_RI_disable();
+#ifdef CONFIG_PPC64
+               if (regs->msr & MSR_EE)
+                       local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+#endif
+       }
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       local_paca->tm_scratch = regs->msr;
+#endif
+
+       /*
+        * Don't want to mfspr(SPRN_AMR) here, because this comes after mtmsr,
+        * which would cause Read-After-Write stalls. Hence, we take the AMR
+        * value from the check above.
+        */
+#ifdef CONFIG_PPC64
+       kuap_kernel_restore(regs, amr);
+#endif
+
+       return ret;
+}
+#endif
index 5b69a6a..c00214a 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/pci.h>
 #include <linux/iommu.h>
 #include <linux/sched.h>
+#include <linux/debugfs.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
 
 #define DBG(...)
 
+#ifdef CONFIG_IOMMU_DEBUGFS
+static int iommu_debugfs_weight_get(void *data, u64 *val)
+{
+       struct iommu_table *tbl = data;
+       *val = bitmap_weight(tbl->it_map, tbl->it_size);
+       return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n");
+
+static void iommu_debugfs_add(struct iommu_table *tbl)
+{
+       char name[10];
+       struct dentry *liobn_entry;
+
+       sprintf(name, "%08lx", tbl->it_index);
+       liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir);
+
+       debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight);
+       debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size);
+       debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift);
+       debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start);
+       debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end);
+       debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels);
+       debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size);
+}
+
+static void iommu_debugfs_del(struct iommu_table *tbl)
+{
+       char name[10];
+       struct dentry *liobn_entry;
+
+       sprintf(name, "%08lx", tbl->it_index);
+       liobn_entry = debugfs_lookup(name, iommu_debugfs_dir);
+       if (liobn_entry)
+               debugfs_remove(liobn_entry);
+}
+#else
+static void iommu_debugfs_add(struct iommu_table *tbl) { }
+static void iommu_debugfs_del(struct iommu_table *tbl) { }
+#endif
+
 static int novmerge;
 
 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
@@ -725,6 +767,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
                welcomed = 1;
        }
 
+       iommu_debugfs_add(tbl);
+
        return tbl;
 }
 
@@ -744,6 +788,8 @@ static void iommu_table_free(struct kref *kref)
                return;
        }
 
+       iommu_debugfs_del(tbl);
+
        iommu_table_release_pages(tbl);
 
        /* verify that table contains no entries */
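
Assuming debugfs is mounted in the usual place and the iommu_debugfs_dir parent directory is named "iommu" (not visible in this hunk, so an assumption), the new per-LIOBN entries can be read like any other debugfs file; a minimal userspace sketch:

#include <stdio.h>

int main(void)
{
        /* "80000000" is an illustrative LIOBN; real names come from the
         * %08lx format of tbl->it_index above. */
        FILE *f = fopen("/sys/kernel/debug/iommu/80000000/weight", "r");
        unsigned long long weight;

        if (!f)
                return 1;
        if (fscanf(f, "%llu", &weight) == 1)
                printf("TCE entries in use: %llu\n", weight);
        fclose(f);
        return 0;
}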
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index cc7a627..086b0a7 100644 (file)
@@ -54,6 +54,7 @@
 #include <linux/pgtable.h>
 
 #include <linux/uaccess.h>
+#include <asm/interrupt.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/cache.h>
@@ -269,6 +270,31 @@ again:
        }
 }
 
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_KUAP)
+static inline void replay_soft_interrupts_irqrestore(void)
+{
+       unsigned long kuap_state = get_kuap();
+
+       /*
+        * Check if anything calls local_irq_enable/restore() when KUAP is
+        * disabled (user access enabled). We handle that case here by saving
+        * and re-locking AMR but we shouldn't get here in the first place,
+        * hence the warning.
+        */
+       kuap_check_amr();
+
+       if (kuap_state != AMR_KUAP_BLOCKED)
+               set_kuap(AMR_KUAP_BLOCKED);
+
+       replay_soft_interrupts();
+
+       if (kuap_state != AMR_KUAP_BLOCKED)
+               set_kuap(kuap_state);
+}
+#else
+#define replay_soft_interrupts_irqrestore() replay_soft_interrupts()
+#endif
+
 notrace void arch_local_irq_restore(unsigned long mask)
 {
        unsigned char irq_happened;
@@ -332,7 +358,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
        irq_soft_mask_set(IRQS_ALL_DISABLED);
        trace_hardirqs_off();
 
-       replay_soft_interrupts();
+       replay_soft_interrupts_irqrestore();
        local_paca->irq_happened = 0;
 
        trace_hardirqs_on();
@@ -644,8 +670,6 @@ void __do_irq(struct pt_regs *regs)
 {
        unsigned int irq;
 
-       irq_enter();
-
        trace_irq_entry(regs);
 
        /*
@@ -665,11 +689,9 @@ void __do_irq(struct pt_regs *regs)
                generic_handle_irq(irq);
 
        trace_irq_exit(regs);
-
-       irq_exit();
 }
 
-void do_IRQ(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
        void *cursp, *irqsp, *sirqsp;
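
For readers following the conversion: DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ) supplies, in one place, the entry/exit bookkeeping that used to be open-coded (note the irq_enter()/irq_exit() pairs deleted from __do_irq above). Simplified to its shape, with names approximated from the new asm/interrupt.h rather than copied verbatim, the macro expands to roughly:

/* Rough shape of the wrapper generated for do_IRQ; a sketch, not the
 * exact kernel definition. */
static inline void ____do_IRQ(struct pt_regs *regs);

void do_IRQ(struct pt_regs *regs)
{
        struct interrupt_state state;

        interrupt_async_enter_prepare(regs, &state);    /* irq_enter() et al. */
        ____do_IRQ(regs);                               /* the body in the hunk above */
        interrupt_async_exit_prepare(regs, &state);     /* irq_exit() et al. */
}

static inline void ____do_IRQ(struct pt_regs *regs)
{
        /* ...handler body as written above... */
}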
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9f3e133..11f0cae 100644 (file)
 #include <linux/irq_work.h>
 #include <linux/extable.h>
 #include <linux/ftrace.h>
+#include <linux/memblock.h>
 
+#include <asm/interrupt.h>
 #include <asm/machdep.h>
 #include <asm/mce.h>
 #include <asm/nmi.h>
+#include <asm/asm-prototypes.h>
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-                                       mce_ue_event_queue);
+#include "setup.h"
 
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
@@ -103,9 +96,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
                    struct mce_error_info *mce_err,
                    uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-       int index = __this_cpu_inc_return(mce_nest_count) - 1;
-       struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+       int index = local_paca->mce_info->mce_nest_count++;
+       struct machine_check_event *mce;
 
+       mce = &local_paca->mce_info->mce_event[index];
        /*
         * Return if we don't have enough space to log mce event.
         * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
@@ -191,7 +185,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-       int index = __this_cpu_read(mce_nest_count) - 1;
+       int index = local_paca->mce_info->mce_nest_count - 1;
        struct machine_check_event *mc_evt;
        int ret = 0;
 
@@ -201,7 +195,7 @@ int get_mce_event(struct machine_check_event *mce, bool release)
 
        /* Check if we have MCE info to process. */
        if (index < MAX_MC_EVT) {
-               mc_evt = this_cpu_ptr(&mce_event[index]);
+               mc_evt = &local_paca->mce_info->mce_event[index];
                /* Copy the event structure and release the original */
                if (mce)
                        *mce = *mc_evt;
@@ -211,7 +205,7 @@ int get_mce_event(struct machine_check_event *mce, bool release)
        }
        /* Decrement the count to free the slot. */
        if (release)
-               __this_cpu_dec(mce_nest_count);
+               local_paca->mce_info->mce_nest_count--;
 
        return ret;
 }
@@ -233,13 +227,14 @@ static void machine_check_ue_event(struct machine_check_event *evt)
 {
        int index;
 
-       index = __this_cpu_inc_return(mce_ue_count) - 1;
+       index = local_paca->mce_info->mce_ue_count++;
        /* If queue is full, just return for now. */
        if (index >= MAX_MC_EVT) {
-               __this_cpu_dec(mce_ue_count);
+               local_paca->mce_info->mce_ue_count--;
                return;
        }
-       memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
+       memcpy(&local_paca->mce_info->mce_ue_event_queue[index],
+              evt, sizeof(*evt));
 
        /* Queue work to process this event later. */
        irq_work_queue(&mce_ue_event_irq_work);
@@ -256,13 +251,14 @@ void machine_check_queue_event(void)
        if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
                return;
 
-       index = __this_cpu_inc_return(mce_queue_count) - 1;
+       index = local_paca->mce_info->mce_queue_count++;
        /* If queue is full, just return for now. */
        if (index >= MAX_MC_EVT) {
-               __this_cpu_dec(mce_queue_count);
+               local_paca->mce_info->mce_queue_count--;
                return;
        }
-       memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
+       memcpy(&local_paca->mce_info->mce_event_queue[index],
+              &evt, sizeof(evt));
 
        /* Queue irq work to process this event later. */
        irq_work_queue(&mce_event_process_work);
@@ -289,9 +285,9 @@ static void machine_process_ue_event(struct work_struct *work)
        int index;
        struct machine_check_event *evt;
 
-       while (__this_cpu_read(mce_ue_count) > 0) {
-               index = __this_cpu_read(mce_ue_count) - 1;
-               evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+       while (local_paca->mce_info->mce_ue_count > 0) {
+               index = local_paca->mce_info->mce_ue_count - 1;
+               evt = &local_paca->mce_info->mce_ue_event_queue[index];
                blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
 #ifdef CONFIG_MEMORY_FAILURE
                /*
@@ -304,7 +300,7 @@ static void machine_process_ue_event(struct work_struct *work)
                 */
                if (evt->error_type == MCE_ERROR_TYPE_UE) {
                        if (evt->u.ue_error.ignore_event) {
-                               __this_cpu_dec(mce_ue_count);
+                               local_paca->mce_info->mce_ue_count--;
                                continue;
                        }
 
@@ -320,7 +316,7 @@ static void machine_process_ue_event(struct work_struct *work)
                                        "was generated\n");
                }
 #endif
-               __this_cpu_dec(mce_ue_count);
+               local_paca->mce_info->mce_ue_count--;
        }
 }
 /*
@@ -338,17 +334,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
         * For now just print it to console.
         * TODO: log this error event to FSP or nvram.
         */
-       while (__this_cpu_read(mce_queue_count) > 0) {
-               index = __this_cpu_read(mce_queue_count) - 1;
-               evt = this_cpu_ptr(&mce_event_queue[index]);
+       while (local_paca->mce_info->mce_queue_count > 0) {
+               index = local_paca->mce_info->mce_queue_count - 1;
+               evt = &local_paca->mce_info->mce_event_queue[index];
 
                if (evt->error_type == MCE_ERROR_TYPE_UE &&
                    evt->u.ue_error.ignore_event) {
-                       __this_cpu_dec(mce_queue_count);
+                       local_paca->mce_info->mce_queue_count--;
                        continue;
                }
                machine_check_print_event_info(evt, false, false);
-               __this_cpu_dec(mce_queue_count);
+               local_paca->mce_info->mce_queue_count--;
        }
 }
 
@@ -588,15 +584,9 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
  *
  * regs->nip and regs->msr contains srr0 and ssr1.
  */
-long notrace machine_check_early(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_NMI(machine_check_early)
 {
        long handled = 0;
-       u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
-
-       this_cpu_set_ftrace_enabled(0);
-       /* Do not use nmi_enter/exit for pseries hpte guest */
-       if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
-               nmi_enter();
 
        hv_nmi_check_nonrecoverable(regs);
 
@@ -606,11 +596,6 @@ long notrace machine_check_early(struct pt_regs *regs)
        if (ppc_md.machine_check_early)
                handled = ppc_md.machine_check_early(regs);
 
-       if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
-               nmi_exit();
-
-       this_cpu_set_ftrace_enabled(ftrace_enabled);
-
        return handled;
 }
 
@@ -722,7 +707,7 @@ long hmi_handle_debugtrig(struct pt_regs *regs)
 /*
  * Return values:
  */
-long hmi_exception_realmode(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode)
 {      
        int ret;
 
@@ -741,3 +726,24 @@ long hmi_exception_realmode(struct pt_regs *regs)
 
        return 1;
 }
+
+void __init mce_init(void)
+{
+       struct mce_info *mce_info;
+       u64 limit;
+       int i;
+
+       limit = min(ppc64_bolted_size(), ppc64_rma_size);
+       for_each_possible_cpu(i) {
+               mce_info = memblock_alloc_try_nid(sizeof(*mce_info),
+                                                 __alignof__(*mce_info),
+                                                 MEMBLOCK_LOW_LIMIT,
+                                                 limit, cpu_to_node(i));
+               if (!mce_info)
+                       goto err;
+               paca_ptrs[i]->mce_info = mce_info;
+       }
+       return;
+err:
+       panic("Failed to allocate memory for MCE event data\n");
+}
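+
The conversion above moves the MCE event queues from per-CPU variables into memory reachable via the PACA even in real mode, which is why mce_init() allocates below min(ppc64_bolted_size(), ppc64_rma_size). Reconstructed from the accesses in this hunk (field order is a guess; the real definition lands in asm/mce.h), the new structure looks roughly like:

/* Sketch of the PACA-hosted MCE state implied by the accesses above;
 * reconstructed from usage, not copied from asm/mce.h. */
struct mce_info {
        int mce_nest_count;
        struct machine_check_event mce_event[MAX_MC_EVT];
        /* Queue for delayed MCE events. */
        int mce_queue_count;
        struct machine_check_event mce_event_queue[MAX_MC_EVT];
        /* Queue for delayed MCE UE events. */
        int mce_ue_count;
        struct machine_check_event mce_ue_event_queue[MAX_MC_EVT];
};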
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 69bfe96..7f7cdbe 100644 (file)
@@ -141,30 +141,11 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
        }
 }
 
-/*
- * emulate_step() requires insn to be emulated as
- * second parameter. Load register 'r4' with the
- * instruction.
- */
-void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
-{
-       /* addis r4,0,(insn)@h */
-       patch_instruction((struct ppc_inst *)addr,
-                         ppc_inst(PPC_INST_ADDIS | ___PPC_RT(4) |
-                                  ((val >> 16) & 0xffff)));
-       addr++;
-
-       /* ori r4,r4,(insn)@l */
-       patch_instruction((struct ppc_inst *)addr,
-                         ppc_inst(PPC_INST_ORI | ___PPC_RA(4) |
-                                  ___PPC_RS(4) | (val & 0xffff)));
-}
-
 /*
  * Generate instructions to load provided immediate 64-bit value
  * to register 'reg' and patch these instructions at 'addr'.
  */
-void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr)
+static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr)
 {
        /* lis reg,(op)@highest */
        patch_instruction((struct ppc_inst *)addr,
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 2b55599..001e90c 100644 (file)
@@ -1699,3 +1699,13 @@ static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl);
+
+
+static int __init discover_phbs(void)
+{
+       if (ppc_md.discover_phbs)
+               ppc_md.discover_phbs();
+
+       return 0;
+}
+core_initcall(discover_phbs);
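
A platform opts in to the deferred probing by filling in the new machdep hook; a hypothetical example (only the .discover_phbs field is from this series, the rest is illustrative):

#include <asm/machdep.h>

static void __init demo_discover_phbs(void)
{
        /* Allocate and register pci_controller structures here. By
         * core_initcall time the slab allocator and device model are up,
         * unlike the old setup_arch()-time probing. */
}

define_machine(demo) {
        .name           = "demo",
        .discover_phbs  = demo_discover_phbs,
};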
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index e99b7c5..61571ae 100644 (file)
@@ -443,46 +443,6 @@ void *pci_traverse_device_nodes(struct device_node *start,
 }
 EXPORT_SYMBOL_GPL(pci_traverse_device_nodes);
 
-static struct pci_dn *pci_dn_next_one(struct pci_dn *root,
-                                     struct pci_dn *pdn)
-{
-       struct list_head *next = pdn->child_list.next;
-
-       if (next != &pdn->child_list)
-               return list_entry(next, struct pci_dn, list);
-
-       while (1) {
-               if (pdn == root)
-                       return NULL;
-
-               next = pdn->list.next;
-               if (next != &pdn->parent->child_list)
-                       break;
-
-               pdn = pdn->parent;
-       }
-
-       return list_entry(next, struct pci_dn, list);
-}
-
-void *traverse_pci_dn(struct pci_dn *root,
-                     void *(*fn)(struct pci_dn *, void *),
-                     void *data)
-{
-       struct pci_dn *pdn = root;
-       void *ret;
-
-       /* Only scan the child nodes */
-       for (pdn = pci_dn_next_one(root, pdn); pdn;
-            pdn = pci_dn_next_one(root, pdn)) {
-               ret = fn(pdn, data);
-               if (ret)
-                       return ret;
-       }
-
-       return NULL;
-}
-
 static void *add_pdn(struct device_node *dn, void *data)
 {
        struct pci_controller *hose = data;
@@ -521,28 +481,6 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
        pci_traverse_device_nodes(dn, add_pdn, phb);
 }
 
-/** 
- * pci_devs_phb_init - Initialize phbs and pci devs under them.
- * 
- * This routine walks over all phb's (pci-host bridges) on the
- * system, and sets up assorted pci-related structures 
- * (including pci info in the device node structs) for each
- * pci device found underneath.  This routine runs once,
- * early in the boot sequence.
- */
-static int __init pci_devs_phb_init(void)
-{
-       struct pci_controller *phb, *tmp;
-
-       /* This must be done first so the device nodes have valid pci info! */
-       list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
-               pci_devs_phb_init_dynamic(phb);
-
-       return 0;
-}
-
-core_initcall(pci_devs_phb_init);
-
 static void pci_dev_pdn_setup(struct pci_dev *pdev)
 {
        struct pci_dn *pdn;
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a66f435..924d023 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/pkeys.h>
 #include <linux/seq_buf.h>
 
+#include <asm/interrupt.h>
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
@@ -659,11 +660,10 @@ static void do_break_handler(struct pt_regs *regs)
        }
 }
 
-void do_break (struct pt_regs *regs, unsigned long address,
-                   unsigned long error_code)
+DEFINE_INTERRUPT_HANDLER(do_break)
 {
        current->thread.trap_nr = TRAP_HWBKPT;
-       if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+       if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, regs->dsisr,
                        11, SIGSEGV) == NOTIFY_STOP)
                return;
 
@@ -681,7 +681,7 @@ void do_break (struct pt_regs *regs, unsigned long address,
                do_break_handler(regs);
 
        /* Deliver the signal to userspace */
-       force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address);
+       force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)regs->dar);
 }
 #endif /* CONFIG_PPC_ADV_DEBUG_REGS */
 
@@ -2047,6 +2047,9 @@ static inline int valid_emergency_stack(unsigned long sp, struct task_struct *p,
        unsigned long stack_page;
        unsigned long cpu = task_cpu(p);
 
+       if (!paca_ptrs)
+               return 0;
+
        stack_page = (unsigned long)paca_ptrs[cpu]->emergency_sp - THREAD_SIZE;
        if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
                return 1;
@@ -2176,7 +2179,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack,
                 * See if this is an exception frame.
                 * We look for the "regshere" marker in the current frame.
                 */
-               if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
+               if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS)
                    && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
                        struct pt_regs *regs = (struct pt_regs *)
                                (sp + STACK_FRAME_OVERHEAD);
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ae3c417..9a4797d 100644 (file)
@@ -707,7 +707,7 @@ static void __init save_fscr_to_task(void)
                init_task.thread.fscr = mfspr(SPRN_FSCR);
 }
 #else
-static inline void save_fscr_to_task(void) {};
+static inline void save_fscr_to_task(void) {}
 #endif
 
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index e9d4eb6..ccf77b9 100644 (file)
@@ -1331,14 +1331,10 @@ static void __init prom_check_platform_support(void)
                if (prop_len > sizeof(vec))
                        prom_printf("WARNING: ibm,arch-vec-5-platform-support longer than expected (len: %d)\n",
                                    prop_len);
-               prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support",
-                            &vec, sizeof(vec));
-               for (i = 0; i < sizeof(vec); i += 2) {
-                       prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2
-                                                                 , vec[i]
-                                                                 , vec[i + 1]);
-                       prom_parse_platform_support(vec[i], vec[i + 1],
-                                                   &supported);
+               prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support", &vec, sizeof(vec));
+               for (i = 0; i < prop_len; i += 2) {
+                       prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2, vec[i], vec[i + 1]);
+                       prom_parse_platform_support(vec[i], vec[i + 1], &supported);
                }
        }
 
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
index 3d44b73..4f3d4ff 100644 (file)
@@ -262,8 +262,6 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 {
        u32 flags;
 
-       user_exit();
-
        flags = READ_ONCE(current_thread_info()->flags) &
                (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE);
 
@@ -340,8 +338,6 @@ void do_syscall_trace_leave(struct pt_regs *regs)
        step = test_thread_flag(TIF_SINGLESTEP);
        if (step || test_thread_flag(TIF_SYSCALL_TRACE))
                tracehook_report_syscall_exit(regs, step);
-
-       user_enter();
 }
 
 void __init pt_regs_check(void);
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 71f38e9..bee984b 100644 (file)
@@ -64,6 +64,7 @@
 #include <asm/mmu_context.h>
 #include <asm/cpu_has_feature.h>
 #include <asm/kasan.h>
+#include <asm/mce.h>
 
 #include "setup.h"
 
@@ -237,18 +238,17 @@ static int show_cpuinfo(struct seq_file *m, void *v)
        maj = (pvr >> 8) & 0xFF;
        min = pvr & 0xFF;
 
-       seq_printf(m, "processor\t: %lu\n", cpu_id);
-       seq_printf(m, "cpu\t\t: ");
+       seq_printf(m, "processor\t: %lu\ncpu\t\t: ", cpu_id);
 
        if (cur_cpu_spec->pvr_mask && cur_cpu_spec->cpu_name)
-               seq_printf(m, "%s", cur_cpu_spec->cpu_name);
+               seq_puts(m, cur_cpu_spec->cpu_name);
        else
                seq_printf(m, "unknown (%08x)", pvr);
 
        if (cpu_has_feature(CPU_FTR_ALTIVEC))
-               seq_printf(m, ", altivec supported");
+               seq_puts(m, ", altivec supported");
 
-       seq_printf(m, "\n");
+       seq_putc(m, '\n');
 
 #ifdef CONFIG_TAU
        if (cpu_has_feature(CPU_FTR_TAU)) {
@@ -327,7 +327,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
                seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy / (500000 / HZ),
                           (loops_per_jiffy / (5000 / HZ)) % 100);
 
-       seq_printf(m, "\n");
+       seq_putc(m, '\n');
 
        /* If this is the last cpu, print the summary */
        if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids)
@@ -938,6 +938,7 @@ void __init setup_arch(char **cmdline_p)
        exc_lvl_early_init();
        emergency_stack_init();
 
+       mce_init();
        smp_release_cpus();
 
        initmem_init();
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index 2dd0d9c..84058bb 100644 (file)
@@ -14,31 +14,31 @@ void irqstack_early_init(void);
 #ifdef CONFIG_PPC32
 void setup_power_save(void);
 #else
-static inline void setup_power_save(void) { };
+static inline void setup_power_save(void) { }
 #endif
 
 #if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
 void check_smt_enabled(void);
 #else
-static inline void check_smt_enabled(void) { };
+static inline void check_smt_enabled(void) { }
 #endif
 
 #if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
 void setup_tlb_core_data(void);
 #else
-static inline void setup_tlb_core_data(void) { };
+static inline void setup_tlb_core_data(void) { }
 #endif
 
 #if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_BOOKE) || defined(CONFIG_40x)
 void exc_lvl_early_init(void);
 #else
-static inline void exc_lvl_early_init(void) { };
+static inline void exc_lvl_early_init(void) { }
 #endif
 
 #if defined(CONFIG_PPC64) || defined(CONFIG_VMAP_STACK)
 void emergency_stack_init(void);
 #else
-static inline void emergency_stack_init(void) { };
+static inline void emergency_stack_init(void) { }
 #endif
 
 #ifdef CONFIG_PPC64
@@ -55,7 +55,7 @@ extern unsigned long spr_default_dscr;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 void kvm_cma_reserve(void);
 #else
-static inline void kvm_cma_reserve(void) { };
+static inline void kvm_cma_reserve(void) { }
 #endif
 
 #ifdef CONFIG_TAU
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index c28e949..560ed8b 100644 (file)
@@ -67,6 +67,7 @@
 #include <asm/kup.h>
 #include <asm/early_ioremap.h>
 #include <asm/pgalloc.h>
+#include <asm/asm-prototypes.h>
 
 #include "setup.h"
 
@@ -258,7 +259,7 @@ static void cpu_ready_for_interrupts(void)
 
 unsigned long spr_default_dscr = 0;
 
-void __init record_spr_defaults(void)
+static void __init record_spr_defaults(void)
 {
        if (early_cpu_has_feature(CPU_FTR_DSCR))
                spr_default_dscr = mfspr(SPRN_DSCR);
@@ -1008,7 +1009,7 @@ void rfi_flush_enable(bool enable)
        rfi_flush = enable;
 }
 
-void entry_flush_enable(bool enable)
+static void entry_flush_enable(bool enable)
 {
        if (enable) {
                do_entry_flush_fixups(enabled_flush_types);
@@ -1020,7 +1021,7 @@ void entry_flush_enable(bool enable)
        entry_flush = enable;
 }
 
-void uaccess_flush_enable(bool enable)
+static void uaccess_flush_enable(bool enable)
 {
        if (enable) {
                do_uaccess_flush_fixups(enabled_flush_types);
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 53782aa..9ded046 100644 (file)
@@ -282,8 +282,6 @@ static void do_signal(struct task_struct *tsk)
 
 void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 {
-       user_exit();
-
        if (thread_info_flags & _TIF_UPROBE)
                uprobe_notify_resume(regs);
 
@@ -299,8 +297,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
                tracehook_notify_resume(regs);
                rseq_handle_notify_resume(NULL, regs);
        }
-
-       user_enter();
 }
 
 static unsigned long get_tm_stackpointer(struct task_struct *tsk)
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 934cbdf..75ee918 100644 (file)
@@ -929,8 +929,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
        regs->gpr[3] = ksig->sig;
        regs->gpr[4] = (unsigned long) sc;
        regs->nip = (unsigned long)ksig->ka.sa.sa_handler;
-       /* enter the signal handler in big-endian mode */
+       /* enter the signal handler in native-endian mode */
        regs->msr &= ~MSR_LE;
+       regs->msr |= (MSR_KERNEL & MSR_LE);
        return 0;
 
 failed:
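
The fix works because MSR_KERNEL carries the kernel's own MSR_LE bit, so (MSR_KERNEL & MSR_LE) is MSR_LE on a little-endian kernel and 0 on big-endian: the handler now starts in the kernel's native byte order rather than unconditionally big-endian. A standalone demonstration with illustrative constants (only the MSR_LE bit position matches reg.h):

#include <stdio.h>

#define MSR_LE          0x1UL                   /* matches the real bit */
#ifdef DEMO_LE_KERNEL                           /* build with -DDEMO_LE_KERNEL */
#define MSR_KERNEL      (0x1032UL | MSR_LE)     /* illustrative value */
#else
#define MSR_KERNEL      0x1032UL                /* illustrative value */
#endif

int main(void)
{
        unsigned long msr = 0x8001UL;           /* illustrative user MSR, LE set */

        msr &= ~MSR_LE;                         /* old behaviour: force big-endian */
        msr |= (MSR_KERNEL & MSR_LE);           /* new: kernel's endianness */
        printf("signal handler runs %s-endian\n",
               (msr & MSR_LE) ? "little" : "big");
        return 0;
}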
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9e2246e..5a4d59a 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/random.h>
 #include <linux/stackprotector.h>
 #include <linux/pgtable.h>
+#include <linux/clockchips.h>
 
 #include <asm/ptrace.h>
 #include <linux/atomic.h>
@@ -576,7 +577,7 @@ void tick_broadcast(const struct cpumask *mask)
 #endif
 
 #ifdef CONFIG_DEBUGGER
-void debugger_ipi_callback(struct pt_regs *regs)
+static void debugger_ipi_callback(struct pt_regs *regs)
 {
        debugger_ipi(regs);
 }
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index d36c639..16ff039 100644 (file)
@@ -59,57 +59,64 @@ unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
 /*
  * long long munging:
  * The 32-bit ABI passes long longs in an odd/even register pair.
+ * High and low parts are swapped depending on endian mode,
+ * so define a macro (similar to mips linux32) to handle that.
  */
+#ifdef __LITTLE_ENDIAN__
+#define merge_64(low, high) ((u64)high << 32) | low
+#else
+#define merge_64(high, low) ((u64)high << 32) | low
+#endif
 
 compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count,
-                            u32 reg6, u32 poshi, u32 poslo)
+                            u32 reg6, u32 pos1, u32 pos2)
 {
-       return ksys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
+       return ksys_pread64(fd, ubuf, count, merge_64(pos1, pos2));
 }
 
 compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count,
-                             u32 reg6, u32 poshi, u32 poslo)
+                             u32 reg6, u32 pos1, u32 pos2)
 {
-       return ksys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
+       return ksys_pwrite64(fd, ubuf, count, merge_64(pos1, pos2));
 }
 
-compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count)
+compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count)
 {
-       return ksys_readahead(fd, ((loff_t)offhi << 32) | offlo, count);
+       return ksys_readahead(fd, merge_64(offset1, offset2), count);
 }
 
 asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4,
-                               unsigned long high, unsigned long low)
+                               unsigned long len1, unsigned long len2)
 {
-       return ksys_truncate(path, (high << 32) | low);
+       return ksys_truncate(path, merge_64(len1, len2));
 }
 
-asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo,
-                                    u32 lenhi, u32 lenlo)
+asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2,
+                                    u32 len1, u32 len2)
 {
-       return ksys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo,
-                            ((loff_t)lenhi << 32) | lenlo);
+       return ksys_fallocate(fd, mode, ((loff_t)offset1 << 32) | offset2,
+                            merge_64(len1, len2));
 }
 
-asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
-                                unsigned long low)
+asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1,
+                                unsigned long len2)
 {
-       return ksys_ftruncate(fd, (high << 32) | low);
+       return ksys_ftruncate(fd, merge_64(len1, len2));
 }
 
-long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low,
+long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2,
                     size_t len, int advice)
 {
-       return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, len,
+       return ksys_fadvise64_64(fd, merge_64(offset1, offset2), len,
                                 advice);
 }
 
 asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags,
-                                  unsigned offset_hi, unsigned offset_lo,
-                                  unsigned nbytes_hi, unsigned nbytes_lo)
+                                  unsigned offset1, unsigned offset2,
+                                  unsigned nbytes1, unsigned nbytes2)
 {
-       loff_t offset = ((loff_t)offset_hi << 32) | offset_lo;
-       loff_t nbytes = ((loff_t)nbytes_hi << 32) | nbytes_lo;
+       loff_t offset = merge_64(offset1, offset2);
+       loff_t nbytes = merge_64(nbytes1, nbytes2);
 
        return ksys_sync_file_range(fd, offset, nbytes, flags);
 }
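
A self-contained illustration of the munging (outer parentheses added here for macro hygiene; in the hunk above every expansion is a function argument, so precedence never bites). __LITTLE_ENDIAN__ is predefined by powerpc compilers; pass -D__LITTLE_ENDIAN__ to try the LE case elsewhere:

#include <stdint.h>
#include <stdio.h>

#ifdef __LITTLE_ENDIAN__
#define merge_64(low, high) (((uint64_t)(high) << 32) | (low))
#else
#define merge_64(high, low) (((uint64_t)(high) << 32) | (low))
#endif

int main(void)
{
        /* A 32-bit task passes pos = 0x0000000123456789 in two registers:
         * LE ABI: first reg = 0x23456789 (low), second = 0x00000001 (high);
         * BE ABI: first reg = high, second = low. */
#ifdef __LITTLE_ENDIAN__
        uint32_t reg1 = 0x23456789, reg2 = 0x00000001;
#else
        uint32_t reg1 = 0x00000001, reg2 = 0x23456789;
#endif
        printf("pos = 0x%llx\n",
               (unsigned long long)merge_64(reg1, reg2));      /* 0x123456789 */
        return 0;
}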
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
deleted file mode 100644 (file)
index 7c85ed0..0000000
+++ /dev/null
@@ -1,441 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <linux/err.h>
-#include <asm/asm-prototypes.h>
-#include <asm/kup.h>
-#include <asm/cputime.h>
-#include <asm/hw_irq.h>
-#include <asm/kprobes.h>
-#include <asm/paca.h>
-#include <asm/ptrace.h>
-#include <asm/reg.h>
-#include <asm/signal.h>
-#include <asm/switch_to.h>
-#include <asm/syscall.h>
-#include <asm/time.h>
-#include <asm/unistd.h>
-
-typedef long (*syscall_fn)(long, long, long, long, long, long);
-
-/* Has to run notrace because it is entered not completely "reconciled" */
-notrace long system_call_exception(long r3, long r4, long r5,
-                                  long r6, long r7, long r8,
-                                  unsigned long r0, struct pt_regs *regs)
-{
-       syscall_fn f;
-
-       if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
-               BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
-
-       trace_hardirqs_off(); /* finish reconciling */
-
-       if (IS_ENABLED(CONFIG_PPC_BOOK3S))
-               BUG_ON(!(regs->msr & MSR_RI));
-       BUG_ON(!(regs->msr & MSR_PR));
-       BUG_ON(!FULL_REGS(regs));
-       BUG_ON(regs->softe != IRQS_ENABLED);
-
-#ifdef CONFIG_PPC_PKEY
-       if (mmu_has_feature(MMU_FTR_PKEY)) {
-               unsigned long amr, iamr;
-               bool flush_needed = false;
-               /*
-                * When entering from userspace we mostly have the AMR/IAMR
-                * different from kernel default values. Hence don't compare.
-                */
-               amr = mfspr(SPRN_AMR);
-               iamr = mfspr(SPRN_IAMR);
-               regs->amr  = amr;
-               regs->iamr = iamr;
-               if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
-                       mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
-                       flush_needed = true;
-               }
-               if (mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
-                       mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
-                       flush_needed = true;
-               }
-               if (flush_needed)
-                       isync();
-       } else
-#endif
-               kuap_check_amr();
-
-       account_cpu_user_entry();
-
-#ifdef CONFIG_PPC_SPLPAR
-       if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
-           firmware_has_feature(FW_FEATURE_SPLPAR)) {
-               struct lppaca *lp = local_paca->lppaca_ptr;
-
-               if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
-                       accumulate_stolen_time();
-       }
-#endif
-
-       /*
-        * This is not required for the syscall exit path, but makes the
-        * stack frame look nicer. If this was initialised in the first stack
-        * frame, or if the unwinder was taught the first stack frame always
-        * returns to user with IRQS_ENABLED, this store could be avoided!
-        */
-       regs->softe = IRQS_ENABLED;
-
-       local_irq_enable();
-
-       if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
-               if (unlikely(regs->trap == 0x7ff0)) {
-                       /* Unsupported scv vector */
-                       _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-                       return regs->gpr[3];
-               }
-               /*
-                * We use the return value of do_syscall_trace_enter() as the
-                * syscall number. If the syscall was rejected for any reason
-                * do_syscall_trace_enter() returns an invalid syscall number
-                * and the test against NR_syscalls will fail and the return
-                * value to be used is in regs->gpr[3].
-                */
-               r0 = do_syscall_trace_enter(regs);
-               if (unlikely(r0 >= NR_syscalls))
-                       return regs->gpr[3];
-               r3 = regs->gpr[3];
-               r4 = regs->gpr[4];
-               r5 = regs->gpr[5];
-               r6 = regs->gpr[6];
-               r7 = regs->gpr[7];
-               r8 = regs->gpr[8];
-
-       } else if (unlikely(r0 >= NR_syscalls)) {
-               if (unlikely(regs->trap == 0x7ff0)) {
-                       /* Unsupported scv vector */
-                       _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-                       return regs->gpr[3];
-               }
-               return -ENOSYS;
-       }
-
-       /* May be faster to do array_index_nospec? */
-       barrier_nospec();
-
-       if (unlikely(is_32bit_task())) {
-               f = (void *)compat_sys_call_table[r0];
-
-               r3 &= 0x00000000ffffffffULL;
-               r4 &= 0x00000000ffffffffULL;
-               r5 &= 0x00000000ffffffffULL;
-               r6 &= 0x00000000ffffffffULL;
-               r7 &= 0x00000000ffffffffULL;
-               r8 &= 0x00000000ffffffffULL;
-
-       } else {
-               f = (void *)sys_call_table[r0];
-       }
-
-       return f(r3, r4, r5, r6, r7, r8);
-}
-
-/*
- * local irqs must be disabled. Returns false if the caller must re-enable
- * them, check for new work, and try again.
- */
-static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri)
-{
-       /* This must be done with RI=1 because tracing may touch vmaps */
-       trace_hardirqs_on();
-
-       /* This pattern matches prep_irq_for_idle */
-       if (clear_ri)
-               __hard_EE_RI_disable();
-       else
-               __hard_irq_disable();
-       if (unlikely(lazy_irq_pending_nocheck())) {
-               /* Took an interrupt, may have more exit work to do. */
-               if (clear_ri)
-                       __hard_RI_enable();
-               trace_hardirqs_off();
-               local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
-
-               return false;
-       }
-       local_paca->irq_happened = 0;
-       irq_soft_mask_set(IRQS_ENABLED);
-
-       return true;
-}
-
-/*
- * This should be called after a syscall returns, with r3 the return value
- * from the syscall. If this function returns non-zero, the system call
- * exit assembly should additionally load all GPR registers and CTR and XER
- * from the interrupt frame.
- *
- * The function graph tracer can not trace the return side of this function,
- * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
- */
-notrace unsigned long syscall_exit_prepare(unsigned long r3,
-                                          struct pt_regs *regs,
-                                          long scv)
-{
-       unsigned long *ti_flagsp = &current_thread_info()->flags;
-       unsigned long ti_flags;
-       unsigned long ret = 0;
-
-       kuap_check_amr();
-
-       regs->result = r3;
-
-       /* Check whether the syscall is issued inside a restartable sequence */
-       rseq_syscall(regs);
-
-       ti_flags = *ti_flagsp;
-
-       if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && !scv) {
-               if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
-                       r3 = -r3;
-                       regs->ccr |= 0x10000000; /* Set SO bit in CR */
-               }
-       }
-
-       if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
-               if (ti_flags & _TIF_RESTOREALL)
-                       ret = _TIF_RESTOREALL;
-               else
-                       regs->gpr[3] = r3;
-               clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp);
-       } else {
-               regs->gpr[3] = r3;
-       }
-
-       if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
-               do_syscall_trace_leave(regs);
-               ret |= _TIF_RESTOREALL;
-       }
-
-again:
-       local_irq_disable();
-       ti_flags = READ_ONCE(*ti_flagsp);
-       while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
-               local_irq_enable();
-               if (ti_flags & _TIF_NEED_RESCHED) {
-                       schedule();
-               } else {
-                       /*
-                        * SIGPENDING must restore signal handler function
-                        * argument GPRs, and some non-volatiles (e.g., r1).
-                        * Restore all for now. This could be made lighter.
-                        */
-                       if (ti_flags & _TIF_SIGPENDING)
-                               ret |= _TIF_RESTOREALL;
-                       do_notify_resume(regs, ti_flags);
-               }
-               local_irq_disable();
-               ti_flags = READ_ONCE(*ti_flagsp);
-       }
-
-       if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
-               if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
-                               unlikely((ti_flags & _TIF_RESTORE_TM))) {
-                       restore_tm_state(regs);
-               } else {
-                       unsigned long mathflags = MSR_FP;
-
-                       if (cpu_has_feature(CPU_FTR_VSX))
-                               mathflags |= MSR_VEC | MSR_VSX;
-                       else if (cpu_has_feature(CPU_FTR_ALTIVEC))
-                               mathflags |= MSR_VEC;
-
-                       /*
-                        * If userspace MSR has all available FP bits set,
-                        * then they are live and no need to restore. If not,
-                        * it means the regs were given up and restore_math
-                        * may decide to restore them (to avoid taking an FP
-                        * fault).
-                        */
-                       if ((regs->msr & mathflags) != mathflags)
-                               restore_math(regs);
-               }
-       }
-
-       /* scv need not set RI=0 because SRRs are not used */
-       if (unlikely(!prep_irq_for_enabled_exit(!scv))) {
-               local_irq_enable();
-               goto again;
-       }
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       local_paca->tm_scratch = regs->msr;
-#endif
-
-       account_cpu_user_exit();
-
-#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
-       /*
-        * We do this at the end so that we do context switch with KERNEL AMR
-        */
-       kuap_user_restore(regs);
-#endif
-       return ret;
-}
-
-#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
-notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
-{
-#ifdef CONFIG_PPC_BOOK3E
-       struct thread_struct *ts = &current->thread;
-#endif
-       unsigned long *ti_flagsp = &current_thread_info()->flags;
-       unsigned long ti_flags;
-       unsigned long flags;
-       unsigned long ret = 0;
-
-       if (IS_ENABLED(CONFIG_PPC_BOOK3S))
-               BUG_ON(!(regs->msr & MSR_RI));
-       BUG_ON(!(regs->msr & MSR_PR));
-       BUG_ON(!FULL_REGS(regs));
-       BUG_ON(regs->softe != IRQS_ENABLED);
-
-       /*
-        * We don't need to restore AMR on the way back to userspace for KUAP.
-        * AMR can only have been unlocked if we interrupted the kernel.
-        */
-       kuap_check_amr();
-
-       local_irq_save(flags);
-
-again:
-       ti_flags = READ_ONCE(*ti_flagsp);
-       while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
-               local_irq_enable(); /* returning to user: may enable */
-               if (ti_flags & _TIF_NEED_RESCHED) {
-                       schedule();
-               } else {
-                       if (ti_flags & _TIF_SIGPENDING)
-                               ret |= _TIF_RESTOREALL;
-                       do_notify_resume(regs, ti_flags);
-               }
-               local_irq_disable();
-               ti_flags = READ_ONCE(*ti_flagsp);
-       }
-
-       if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
-               if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
-                               unlikely((ti_flags & _TIF_RESTORE_TM))) {
-                       restore_tm_state(regs);
-               } else {
-                       unsigned long mathflags = MSR_FP;
-
-                       if (cpu_has_feature(CPU_FTR_VSX))
-                               mathflags |= MSR_VEC | MSR_VSX;
-                       else if (cpu_has_feature(CPU_FTR_ALTIVEC))
-                               mathflags |= MSR_VEC;
-
-                       /* See above restore_math comment */
-                       if ((regs->msr & mathflags) != mathflags)
-                               restore_math(regs);
-               }
-       }
-
-       if (unlikely(!prep_irq_for_enabled_exit(true))) {
-               local_irq_enable();
-               local_irq_disable();
-               goto again;
-       }
-
-#ifdef CONFIG_PPC_BOOK3E
-       if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) {
-               /*
-                * Check to see if the dbcr0 register is set up to debug.
-                * Use the internal debug mode bit to do this.
-                */
-               mtmsr(mfmsr() & ~MSR_DE);
-               mtspr(SPRN_DBCR0, ts->debug.dbcr0);
-               mtspr(SPRN_DBSR, -1);
-       }
-#endif
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       local_paca->tm_scratch = regs->msr;
-#endif
-
-       account_cpu_user_exit();
-
-       /*
-        * We do this at the end so that we do context switch with KERNEL AMR
-        */
-       kuap_user_restore(regs);
-       return ret;
-}
-
-void unrecoverable_exception(struct pt_regs *regs);
-void preempt_schedule_irq(void);
-
-notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)
-{
-       unsigned long *ti_flagsp = &current_thread_info()->flags;
-       unsigned long flags;
-       unsigned long ret = 0;
-       unsigned long amr;
-
-       if (IS_ENABLED(CONFIG_PPC_BOOK3S) && unlikely(!(regs->msr & MSR_RI)))
-               unrecoverable_exception(regs);
-       BUG_ON(regs->msr & MSR_PR);
-       BUG_ON(!FULL_REGS(regs));
-
-       amr = kuap_get_and_check_amr();
-
-       if (unlikely(*ti_flagsp & _TIF_EMULATE_STACK_STORE)) {
-               clear_bits(_TIF_EMULATE_STACK_STORE, ti_flagsp);
-               ret = 1;
-       }
-
-       local_irq_save(flags);
-
-       if (regs->softe == IRQS_ENABLED) {
-               /* Returning to a kernel context with local irqs enabled. */
-               WARN_ON_ONCE(!(regs->msr & MSR_EE));
-again:
-               if (IS_ENABLED(CONFIG_PREEMPT)) {
-                       /* Return to preemptible kernel context */
-                       if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) {
-                               if (preempt_count() == 0)
-                                       preempt_schedule_irq();
-                       }
-               }
-
-               if (unlikely(!prep_irq_for_enabled_exit(true))) {
-                       /*
-                        * Can't local_irq_restore to replay if we were in
-                        * interrupt context. Must replay directly.
-                        */
-                       if (irqs_disabled_flags(flags)) {
-                               replay_soft_interrupts();
-                       } else {
-                               local_irq_restore(flags);
-                               local_irq_save(flags);
-                       }
-                       /* Took an interrupt, may have more exit work to do. */
-                       goto again;
-               }
-       } else {
-               /* Returning to a kernel context with local irqs disabled. */
-               __hard_EE_RI_disable();
-               if (regs->msr & MSR_EE)
-                       local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
-       }
-
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       local_paca->tm_scratch = regs->msr;
-#endif
-
-       /*
-        * Don't want to mfspr(SPRN_AMR) here, because this comes after mtmsr,
-        * which would cause Read-After-Write stalls. Hence, we take the AMR
-        * value from the check above.
-        */
-       kuap_kernel_restore(regs, amr);
-
-       return ret;
-}
-#endif
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index f744eb5..96b2157 100644 (file)
@@ -9,9 +9,7 @@
 #
 0      nospu   restart_syscall                 sys_restart_syscall
 1      nospu   exit                            sys_exit
-2      32      fork                            ppc_fork                        sys_fork
-2      64      fork                            sys_fork
-2      spu     fork                            sys_ni_syscall
+2      nospu   fork                            sys_fork
 3      common  read                            sys_read
 4      common  write                           sys_write
 5      common  open                            sys_open                        compat_sys_open
 119    32      sigreturn                       sys_sigreturn                   compat_sys_sigreturn
 119    64      sigreturn                       sys_ni_syscall
 119    spu     sigreturn                       sys_ni_syscall
-120    32      clone                           ppc_clone                       sys_clone
-120    64      clone                           sys_clone
-120    spu     clone                           sys_ni_syscall
+120    nospu   clone                           sys_clone
 121    common  setdomainname                   sys_setdomainname
 122    common  uname                           sys_newuname
 123    common  modify_ldt                      sys_ni_syscall
 186    spu     sendfile                        sys_sendfile64
 187    common  getpmsg                         sys_ni_syscall
 188    common  putpmsg                         sys_ni_syscall
-189    32      vfork                           ppc_vfork                       sys_vfork
-189    64      vfork                           sys_vfork
-189    spu     vfork                           sys_ni_syscall
+189    nospu   vfork                           sys_vfork
 190    common  ugetrlimit                      sys_getrlimit                   compat_sys_getrlimit
 191    common  readahead                       sys_readahead                   compat_sys_readahead
 192    32      mmap2                           sys_mmap2                       compat_sys_mmap2
 248    32      clock_nanosleep                 sys_clock_nanosleep_time32
 248    64      clock_nanosleep                 sys_clock_nanosleep
 248    spu     clock_nanosleep                 sys_clock_nanosleep
-249    32      swapcontext                     ppc_swapcontext                 compat_sys_swapcontext
-249    64      swapcontext                     sys_swapcontext
-249    spu     swapcontext                     sys_ni_syscall
+249    nospu   swapcontext                     sys_swapcontext                 compat_sys_swapcontext
 250    common  tgkill                          sys_tgkill
 251    32      utimes                          sys_utimes_time32
 251    64      utimes                          sys_utimes
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
 434    common  pidfd_open                      sys_pidfd_open
-435    32      clone3                          ppc_clone3                      sys_clone3
-435    64      clone3                          sys_clone3
-435    spu     clone3                          sys_ni_syscall
+435    nospu   clone3                          sys_clone3
 436    common  close_range                     sys_close_range
 437    common  openat2                         sys_openat2
 438    common  pidfd_getfd                     sys_pidfd_getfd
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index 0b4694b..6c31af7 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/delay.h>
 #include <linux/workqueue.h>
 
+#include <asm/interrupt.h>
 #include <asm/io.h>
 #include <asm/reg.h>
 #include <asm/nvram.h>
@@ -100,16 +101,13 @@ static void TAUupdate(int cpu)
  * with interrupts disabled
  */
 
-void TAUException(struct pt_regs * regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(TAUException)
 {
        int cpu = smp_processor_id();
 
-       irq_enter();
        tau[cpu].interrupts++;
 
        TAUupdate(cpu);
-
-       irq_exit();
 }
 #endif /* CONFIG_TAU_INT */
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 67feb35..b67d93a 100644 (file)
 #include <linux/of_clk.h>
 #include <linux/suspend.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/clock.h>
 #include <linux/processor.h>
 #include <asm/trace.h>
 
+#include <asm/interrupt.h>
 #include <asm/io.h>
 #include <asm/nvram.h>
 #include <asm/cache.h>
@@ -570,7 +572,7 @@ void arch_irq_work_raise(void)
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
  */
-void timer_interrupt(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
 {
        struct clock_event_device *evt = this_cpu_ptr(&decrementers);
        u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
@@ -609,7 +611,7 @@ void timer_interrupt(struct pt_regs *regs)
 #endif
 
        old_regs = set_irq_regs(regs);
-       irq_enter();
+
        trace_timer_interrupt_entry(regs);
 
        if (test_irq_work_pending()) {
@@ -634,7 +636,7 @@ void timer_interrupt(struct pt_regs *regs)
        }
 
        trace_timer_interrupt_exit(regs);
-       irq_exit();
+
        set_irq_regs(old_regs);
 }
 EXPORT_SYMBOL(timer_interrupt);
@@ -1030,6 +1032,7 @@ void __init time_init(void)
        tick_setup_hrtimer_broadcast();
 
        of_clk_init(NULL);
+       enable_sched_clock_irqtime();
 }
 
 /*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3ec7b44..1583fd1 100644 (file)
@@ -41,6 +41,7 @@
 #include <asm/emulated_ops.h>
 #include <linux/uaccess.h>
 #include <asm/debugfs.h>
+#include <asm/interrupt.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/rtas.h>
@@ -342,8 +343,8 @@ static bool exception_common(int signr, struct pt_regs *regs, int code,
 
        show_signal_msg(signr, regs, code, addr);
 
-       if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       if (arch_irqs_disabled())
+               interrupt_cond_local_irq_enable(regs);
 
        current->thread.trap_nr = code;
 
@@ -430,16 +431,10 @@ nonrecoverable:
        regs->msr &= ~MSR_RI;
 #endif
 }
-
-void system_reset_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception)
 {
        unsigned long hsrr0, hsrr1;
        bool saved_hsrrs = false;
-       u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
-
-       this_cpu_set_ftrace_enabled(0);
-
-       nmi_enter();
 
        /*
         * System reset can interrupt code where HSRRs are live and MSR[RI]=1.
@@ -503,19 +498,20 @@ out:
                die("Unrecoverable nested System Reset", regs, SIGABRT);
 #endif
        /* Must die if the interrupt is not recoverable */
-       if (!(regs->msr & MSR_RI))
+       if (!(regs->msr & MSR_RI)) {
+               /* For the reason explained in die_mce, nmi_exit before die */
+               nmi_exit();
                die("Unrecoverable System Reset", regs, SIGABRT);
+       }
 
        if (saved_hsrrs) {
                mtspr(SPRN_HSRR0, hsrr0);
                mtspr(SPRN_HSRR1, hsrr1);
        }
 
-       nmi_exit();
-
-       this_cpu_set_ftrace_enabled(ftrace_enabled);
-
        /* What should we do here? We could issue a shutdown or hard reset. */
+
+       return 0;
 }
 
 /*
@@ -788,23 +784,33 @@ int machine_check_generic(struct pt_regs *regs)
 }
 #endif /* everything else */
 
-void machine_check_exception(struct pt_regs *regs)
+void die_mce(const char *str, struct pt_regs *regs, long err)
 {
-       int recover = 0;
-
        /*
-        * BOOK3S_64 does not call this handler as a non-maskable interrupt
-        * (it uses its own early real-mode handler to handle the MCE proper
-        * and then raises irq_work to call this handler when interrupts are
-        * enabled).
-        *
-        * This is silly. The BOOK3S_64 should just call a different function
-        * rather than expecting semantics to magically change. Something
-        * like 'non_nmi_machine_check_exception()', perhaps?
+        * The machine check wants to kill the interrupted context, but
+        * do_exit() checks for in_interrupt() and panics in that case, so
+        * exit the irq/nmi before calling die.
         */
-       const bool nmi = !IS_ENABLED(CONFIG_PPC_BOOK3S_64);
+       if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+               irq_exit();
+       else
+               nmi_exit();
+       die(str, regs, err);
+}
 
-       if (nmi) nmi_enter();
+/*
+ * BOOK3S_64 does not call this handler as a non-maskable interrupt
+ * (it uses its own early real-mode handler to handle the MCE proper
+ * and then raises irq_work to call this handler when interrupts are
+ * enabled).
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
+#else
+DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
+#endif
+{
+       int recover = 0;
 
        __this_cpu_inc(irq_stat.mce_exceptions);
 
@@ -830,21 +836,21 @@ void machine_check_exception(struct pt_regs *regs)
        if (check_io_access(regs))
                goto bail;
 
-       if (nmi) nmi_exit();
-
-       die("Machine check", regs, SIGBUS);
+       die_mce("Machine check", regs, SIGBUS);
 
+bail:
        /* Must die if the interrupt is not recoverable */
        if (!(regs->msr & MSR_RI))
-               die("Unrecoverable Machine check", regs, SIGBUS);
+               die_mce("Unrecoverable Machine check", regs, SIGBUS);
 
+#ifdef CONFIG_PPC_BOOK3S_64
        return;
-
-bail:
-       if (nmi) nmi_exit();
+#else
+       return 0;
+#endif
 }
 
-void SMIException(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */
 {
        die("System Management Interrupt", regs, SIGABRT);
 }
@@ -1030,12 +1036,11 @@ static void p9_hmi_special_emu(struct pt_regs *regs)
 }
 #endif /* CONFIG_VSX */
 
-void handle_hmi_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(handle_hmi_exception)
 {
        struct pt_regs *old_regs;
 
        old_regs = set_irq_regs(regs);
-       irq_enter();
 
 #ifdef CONFIG_VSX
        /* Real mode flagged P9 special emu is needed */
@@ -1055,46 +1060,42 @@ void handle_hmi_exception(struct pt_regs *regs)
        if (ppc_md.handle_hmi_exception)
                ppc_md.handle_hmi_exception(regs);
 
-       irq_exit();
        set_irq_regs(old_regs);
 }
 
-void unknown_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(unknown_exception)
 {
-       enum ctx_state prev_state = exception_enter();
-
        printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
               regs->nip, regs->msr, regs->trap);
 
        _exception(SIGTRAP, regs, TRAP_UNK, 0);
-
-       exception_exit(prev_state);
 }
 
-void instruction_breakpoint_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception)
 {
-       enum ctx_state prev_state = exception_enter();
+       printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
+              regs->nip, regs->msr, regs->trap);
 
+       _exception(SIGTRAP, regs, TRAP_UNK, 0);
+}
+
+DEFINE_INTERRUPT_HANDLER(instruction_breakpoint_exception)
+{
        if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5,
                                        5, SIGTRAP) == NOTIFY_STOP)
-               goto bail;
+               return;
        if (debugger_iabr_match(regs))
-               goto bail;
+               return;
        _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
-
-bail:
-       exception_exit(prev_state);
 }
 
-void RunModeException(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(RunModeException)
 {
        _exception(SIGTRAP, regs, TRAP_UNK, 0);
 }
 
-void single_step_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(single_step_exception)
 {
-       enum ctx_state prev_state = exception_enter();
-
        clear_single_step(regs);
        clear_br_trace(regs);
 
@@ -1103,16 +1104,12 @@ void single_step_exception(struct pt_regs *regs)
 
        if (notify_die(DIE_SSTEP, "single_step", regs, 5,
                                        5, SIGTRAP) == NOTIFY_STOP)
-               goto bail;
+               return;
        if (debugger_sstep(regs))
-               goto bail;
+               return;
 
        _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
-
-bail:
-       exception_exit(prev_state);
 }
-NOKPROBE_SYMBOL(single_step_exception);
 
 /*
  * After we have successfully emulated an instruction, we have to
@@ -1436,9 +1433,8 @@ static int emulate_math(struct pt_regs *regs)
 static inline int emulate_math(struct pt_regs *regs) { return -1; }
 #endif
 
-void program_check_exception(struct pt_regs *regs)
+static void do_program_check(struct pt_regs *regs)
 {
-       enum ctx_state prev_state = exception_enter();
        unsigned int reason = get_reason(regs);
 
        /* We can now get here via a FP Unavailable exception if the core
@@ -1447,22 +1443,22 @@ void program_check_exception(struct pt_regs *regs)
        if (reason & REASON_FP) {
                /* IEEE FP exception */
                parse_fpe(regs);
-               goto bail;
+               return;
        }
        if (reason & REASON_TRAP) {
                unsigned long bugaddr;
                /* Debugger is first in line to stop recursive faults in
                 * rcu_lock, notify_die, or atomic_notifier_call_chain */
                if (debugger_bpt(regs))
-                       goto bail;
+                       return;
 
                if (kprobe_handler(regs))
-                       goto bail;
+                       return;
 
                /* trap exception */
                if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP)
                                == NOTIFY_STOP)
-                       goto bail;
+                       return;
 
                bugaddr = regs->nip;
                /*
@@ -1474,10 +1470,10 @@ void program_check_exception(struct pt_regs *regs)
                if (!(regs->msr & MSR_PR) &&  /* not user-mode */
                    report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) {
                        regs->nip += 4;
-                       goto bail;
+                       return;
                }
                _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
-               goto bail;
+               return;
        }
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        if (reason & REASON_TM) {
@@ -1498,7 +1494,7 @@ void program_check_exception(struct pt_regs *regs)
                 */
                if (user_mode(regs)) {
                        _exception(SIGILL, regs, ILL_ILLOPN, regs->nip);
-                       goto bail;
+                       return;
                } else {
                        printk(KERN_EMERG "Unexpected TM Bad Thing exception "
                               "at %lx (msr 0x%lx) tm_scratch=%llx\n",
@@ -1518,9 +1514,7 @@ void program_check_exception(struct pt_regs *regs)
        if (!user_mode(regs))
                goto sigill;
 
-       /* We restore the interrupt state now */
-       if (!arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       interrupt_cond_local_irq_enable(regs);
 
        /* (reason & REASON_ILLEGAL) would be the obvious thing here,
         * but there seems to be a hardware bug on the 405GP (RevD)
@@ -1531,7 +1525,7 @@ void program_check_exception(struct pt_regs *regs)
         * pattern to occurrences etc. -dgibson 31/Mar/2003
         */
        if (!emulate_math(regs))
-               goto bail;
+               return;
 
        /* Try to emulate it if we should. */
        if (reason & (REASON_ILLEGAL | REASON_PRIVILEGED)) {
@@ -1539,10 +1533,10 @@ void program_check_exception(struct pt_regs *regs)
                case 0:
                        regs->nip += 4;
                        emulate_single_step(regs);
-                       goto bail;
+                       return;
                case -EFAULT:
                        _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
-                       goto bail;
+                       return;
                }
        }
 
@@ -1552,34 +1546,31 @@ sigill:
        else
                _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 
-bail:
-       exception_exit(prev_state);
 }
-NOKPROBE_SYMBOL(program_check_exception);
+
+DEFINE_INTERRUPT_HANDLER(program_check_exception)
+{
+       do_program_check(regs);
+}
 
 /*
  * This occurs when running in hypervisor mode on POWER6 or later
  * and an illegal instruction is encountered.
  */
-void emulation_assist_interrupt(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(emulation_assist_interrupt)
 {
        regs->msr |= REASON_ILLEGAL;
-       program_check_exception(regs);
+       do_program_check(regs);
 }
-NOKPROBE_SYMBOL(emulation_assist_interrupt);
 
-void alignment_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(alignment_exception)
 {
-       enum ctx_state prev_state = exception_enter();
        int sig, code, fixed = 0;
        unsigned long  reason;
 
-       /* We restore the interrupt state now */
-       if (!arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       interrupt_cond_local_irq_enable(regs);
 
        reason = get_reason(regs);
-
        if (reason & REASON_BOUNDARY) {
                sig = SIGBUS;
                code = BUS_ADRALN;
@@ -1587,7 +1578,7 @@ void alignment_exception(struct pt_regs *regs)
        }
 
        if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT))
-               goto bail;
+               return;
 
        /* we don't implement logging of alignment exceptions */
        if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS))
@@ -1597,7 +1588,7 @@ void alignment_exception(struct pt_regs *regs)
                /* skip over emulated instruction */
                regs->nip += inst_length(reason);
                emulate_single_step(regs);
-               goto bail;
+               return;
        }
 
        /* Operand address was bad */
@@ -1612,13 +1603,10 @@ bad:
        if (user_mode(regs))
                _exception(sig, regs, code, regs->dar);
        else
-               bad_page_fault(regs, regs->dar, sig);
-
-bail:
-       exception_exit(prev_state);
+               bad_page_fault(regs, sig);
 }
 
-void StackOverflow(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(StackOverflow)
 {
        pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n",
                current->comm, task_pid_nr(current), regs->gpr[1]);
@@ -1627,46 +1615,33 @@ void StackOverflow(struct pt_regs *regs)
        panic("kernel stack overflow");
 }
 
-void stack_overflow_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(stack_overflow_exception)
 {
-       enum ctx_state prev_state = exception_enter();
-
        die("Kernel stack overflow", regs, SIGSEGV);
-
-       exception_exit(prev_state);
 }
 
-void kernel_fp_unavailable_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(kernel_fp_unavailable_exception)
 {
-       enum ctx_state prev_state = exception_enter();
-
        printk(KERN_EMERG "Unrecoverable FP Unavailable Exception "
                          "%lx at %lx\n", regs->trap, regs->nip);
        die("Unrecoverable FP Unavailable Exception", regs, SIGABRT);
-
-       exception_exit(prev_state);
 }
 
-void altivec_unavailable_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(altivec_unavailable_exception)
 {
-       enum ctx_state prev_state = exception_enter();
-
        if (user_mode(regs)) {
                /* A user program has executed an altivec instruction,
                   but this kernel doesn't support altivec. */
                _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-               goto bail;
+               return;
        }
 
        printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception "
                        "%lx at %lx\n", regs->trap, regs->nip);
        die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
-
-bail:
-       exception_exit(prev_state);
 }
 
-void vsx_unavailable_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(vsx_unavailable_exception)
 {
        if (user_mode(regs)) {
                /* A user program has executed an vsx instruction,
@@ -1697,7 +1672,7 @@ static void tm_unavailable(struct pt_regs *regs)
        die("Unrecoverable TM Unavailable Exception", regs, SIGABRT);
 }
 
-void facility_unavailable_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(facility_unavailable_exception)
 {
        static char *facility_strings[] = {
                [FSCR_FP_LG] = "FPU",
@@ -1737,9 +1712,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
                die("Unexpected facility unavailable exception", regs, SIGABRT);
        }
 
-       /* We restore the interrupt state now */
-       if (!arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       interrupt_cond_local_irq_enable(regs);
 
        if (status == FSCR_DSCR_LG) {
                /*
@@ -1817,7 +1790,7 @@ out:
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 
-void fp_unavailable_tm(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(fp_unavailable_tm)
 {
        /* Note:  This does not handle any kind of FP laziness. */
 
@@ -1850,7 +1823,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
        tm_recheckpoint(&current->thread);
 }
 
-void altivec_unavailable_tm(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(altivec_unavailable_tm)
 {
        /* See the comments in fp_unavailable_tm().  This function operates
         * the same way.
@@ -1865,7 +1838,7 @@ void altivec_unavailable_tm(struct pt_regs *regs)
        current->thread.used_vr = 1;
 }
 
-void vsx_unavailable_tm(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(vsx_unavailable_tm)
 {
        /* See the comments in fp_unavailable_tm().  This works similarly,
         * though we're loading both FP and VEC registers in here.
@@ -1890,11 +1863,40 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-void performance_monitor_exception(struct pt_regs *regs)
+#ifdef CONFIG_PPC64
+DECLARE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi);
+DEFINE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi)
 {
        __this_cpu_inc(irq_stat.pmu_irqs);
 
        perf_irq(regs);
+
+       return 0;
+}
+#endif
+
+DECLARE_INTERRUPT_HANDLER_ASYNC(performance_monitor_exception_async);
+DEFINE_INTERRUPT_HANDLER_ASYNC(performance_monitor_exception_async)
+{
+       __this_cpu_inc(irq_stat.pmu_irqs);
+
+       perf_irq(regs);
+}
+
+DEFINE_INTERRUPT_HANDLER_RAW(performance_monitor_exception)
+{
+       /*
+        * On 64-bit, if perf interrupts hit in a local_irq_disable
+        * (soft-masked) region, we consider them as NMIs. This is required to
+        * prevent hash faults on user addresses when reading callchains (and
+        * looks better from an irq tracing perspective).
+        */
+       if (IS_ENABLED(CONFIG_PPC64) && unlikely(arch_irq_disabled_regs(regs)))
+               performance_monitor_exception_nmi(regs);
+       else
+               performance_monitor_exception_async(regs);
+
+       return 0;
 }
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
@@ -1957,8 +1959,10 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
                mtspr(SPRN_DBCR0, current->thread.debug.dbcr0);
 }
 
-void DebugException(struct pt_regs *regs, unsigned long debug_status)
+DEFINE_INTERRUPT_HANDLER(DebugException)
 {
+       unsigned long debug_status = regs->dsisr;
+
        current->thread.debug.dbsr = debug_status;
 
        /* Hack alert: On BookE, Branch Taken stops on the branch itself, while
@@ -2024,11 +2028,10 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status)
        } else
                handle_debug(regs, debug_status);
 }
-NOKPROBE_SYMBOL(DebugException);
 #endif /* CONFIG_PPC_ADV_DEBUG_REGS */
 
 #ifdef CONFIG_ALTIVEC
-void altivec_assist_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(altivec_assist_exception)
 {
        int err;
 
@@ -2062,9 +2065,10 @@ void altivec_assist_exception(struct pt_regs *regs)
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_FSL_BOOKE
-void CacheLockingException(struct pt_regs *regs, unsigned long address,
-                          unsigned long error_code)
+DEFINE_INTERRUPT_HANDLER(CacheLockingException)
 {
+       unsigned long error_code = regs->dsisr;
+
        /* We treat cache locking instructions from the user
         * as priv ops, in the future we could try to do
         * something smarter
@@ -2076,7 +2080,7 @@ void CacheLockingException(struct pt_regs *regs, unsigned long address,
 #endif /* CONFIG_FSL_BOOKE */
 
 #ifdef CONFIG_SPE
-void SPEFloatingPointException(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(SPEFloatingPointException)
 {
        extern int do_spe_mathemu(struct pt_regs *regs);
        unsigned long spefscr;
@@ -2084,9 +2088,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
        int code = FPE_FLTUNK;
        int err;
 
-       /* We restore the interrupt state now */
-       if (!arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       interrupt_cond_local_irq_enable(regs);
 
        flush_spe_to_thread(current);
 
@@ -2128,14 +2130,12 @@ void SPEFloatingPointException(struct pt_regs *regs)
        return;
 }
 
-void SPEFloatingPointRoundException(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException)
 {
        extern int speround_handler(struct pt_regs *regs);
        int err;
 
-       /* We restore the interrupt state now */
-       if (!arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       interrupt_cond_local_irq_enable(regs);
 
        preempt_disable();
        if (regs->msr & MSR_SPE)
@@ -2170,13 +2170,12 @@ void SPEFloatingPointRoundException(struct pt_regs *regs)
  * in the MSR is 0.  This indicates that SRR0/1 are live, and that
  * we therefore lost state by taking this exception.
  */
-void unrecoverable_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(unrecoverable_exception)
 {
        pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n",
                 regs->trap, regs->nip, regs->msr);
        die("Unrecoverable exception", regs, SIGABRT);
 }
-NOKPROBE_SYMBOL(unrecoverable_exception);
 
 #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x)
 /*
@@ -2190,7 +2189,7 @@ void __attribute__ ((weak)) WatchdogHandler(struct pt_regs *regs)
        return;
 }
 
-void WatchdogException(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(WatchdogException) /* XXX NMI? async? */
 {
        printk (KERN_EMERG "PowerPC Book-E Watchdog Exception\n");
        WatchdogHandler(regs);
@@ -2201,13 +2200,12 @@ void WatchdogException(struct pt_regs *regs)
  * We enter here if we discover during exception entry that we are
  * running in supervisor mode with a userspace value in the stack pointer.
  */
-void kernel_bad_stack(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(kernel_bad_stack)
 {
        printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n",
               regs->gpr[1], regs->nip);
        die("Bad kernel stack pointer", regs, SIGABRT);
 }
-NOKPROBE_SYMBOL(kernel_bad_stack);
 
 void __init trap_init(void)
 {
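
The traps.c conversion above moves entry/exit bookkeeping out of the handler
bodies and into wrapper macros from the new asm/interrupt.h. A simplified
sketch of that pattern (the macro and helper names match the real ones, but
the bodies here are illustrative, not the verbatim header; the real wrappers
also manage irq soft-mask state, KUAP and context tracking):

#define DEFINE_INTERRUPT_HANDLER(func)					\
	static void ____##func(struct pt_regs *regs);			\
									\
	void func(struct pt_regs *regs)					\
	{								\
		interrupt_enter_prepare(regs);	/* tracking on entry */	\
		____##func(regs);					\
		interrupt_exit_prepare(regs);	/* and on exit */	\
	}								\
									\
	static void ____##func(struct pt_regs *regs)

#define DEFINE_INTERRUPT_HANDLER_NMI(func)				\
	static long ____##func(struct pt_regs *regs);			\
									\
	long func(struct pt_regs *regs)					\
	{								\
		long ret;						\
									\
		nmi_enter();	/* NMI bookkeeping lives here now */	\
		ret = ____##func(regs);					\
		nmi_exit();						\
									\
		return ret;						\
	}								\
									\
	static long ____##func(struct pt_regs *regs)

The _ASYNC variant does irq_enter()/irq_exit() instead, and the _RAW variant
generates no bookkeeping at all (used where the handler must run before, or
instead of, the normal entry code, as with do_hash_fault and
performance_monitor_exception below). The repeated
interrupt_cond_local_irq_enable() calls replace the open-coded "We restore the
interrupt state now" pattern and, going by the code they replace, amount to:

static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs)
{
	if (!arch_irq_disabled_regs(regs))
		local_irq_enable();
}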
index af3c15a..c9a8f47 100644 (file)
@@ -26,7 +26,9 @@
 #include <linux/delay.h>
 #include <linux/smp.h>
 
+#include <asm/interrupt.h>
 #include <asm/paca.h>
+#include <asm/nmi.h>
 
 /*
  * The powerpc watchdog ensures that each CPU is able to service timers.
@@ -247,16 +249,17 @@ static void watchdog_timer_interrupt(int cpu)
                watchdog_smp_panic(cpu, tb);
 }
 
-void soft_nmi_interrupt(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 {
        unsigned long flags;
        int cpu = raw_smp_processor_id();
        u64 tb;
 
-       if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
-               return;
+       /* should only arrive from kernel, with irqs disabled */
+       WARN_ON_ONCE(!arch_irq_disabled_regs(regs));
 
-       nmi_enter();
+       if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
+               return 0;
 
        __this_cpu_inc(irq_stat.soft_nmi_irqs);
 
@@ -265,7 +268,7 @@ void soft_nmi_interrupt(struct pt_regs *regs)
                wd_smp_lock(&flags);
                if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
                        wd_smp_unlock(&flags);
-                       goto out;
+                       return 0;
                }
                set_cpu_stuck(cpu, tb);
 
@@ -289,8 +292,7 @@ void soft_nmi_interrupt(struct pt_regs *regs)
        if (wd_panic_timeout_tb < 0x7fffffff)
                mtspr(SPRN_DEC, wd_panic_timeout_tb);
 
-out:
-       nmi_exit();
+       return 0;
 }
 
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
index d0e459b..9842e33 100644 (file)
@@ -102,7 +102,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
                pr_debug("Loaded initrd at 0x%lx\n", initrd_load_addr);
        }
 
-       fdt_size = fdt_totalsize(initial_boot_params) * 2;
+       fdt_size = kexec_fdt_totalsize_ppc64(image);
        fdt = kmalloc(fdt_size, GFP_KERNEL);
        if (!fdt) {
                pr_err("Not enough memory for the device tree.\n");
index c69bcf9..02b9e4d 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/memblock.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <asm/setup.h>
 #include <asm/drmem.h>
 #include <asm/kexec_ranges.h>
 #include <asm/crashdump-ppc64.h>
@@ -925,6 +926,40 @@ out:
        return ret;
 }
 
+/**
+ * kexec_fdt_totalsize_ppc64 - Return the estimated size needed to setup FDT
+ *                             for kexec/kdump kernel.
+ * @image:                     kexec image being loaded.
+ *
+ * Returns the estimated size needed for kexec/kdump kernel FDT.
+ */
+unsigned int kexec_fdt_totalsize_ppc64(struct kimage *image)
+{
+       unsigned int fdt_size;
+       u64 usm_entries;
+
+       /*
+        * The below estimate more than accounts for a typical kexec case,
+        * where the additional space is needed to accommodate things like
+        * the kexec cmdline, a chosen node with properties for the initrd
+        * start & end addresses, and a property to indicate a kexec boot.
+        */
+       fdt_size = fdt_totalsize(initial_boot_params) + (2 * COMMAND_LINE_SIZE);
+       if (image->type != KEXEC_TYPE_CRASH)
+               return fdt_size;
+
+       /*
+        * For kdump kernel, also account for linux,usable-memory and
+        * linux,drconf-usable-memory properties. Get an approximate on the
+        * number of usable memory entries and use for FDT size estimation.
+        */
+       usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) +
+                      (2 * (resource_size(&crashk_res) / drmem_lmb_size())));
+       fdt_size += (unsigned int)(usm_entries * sizeof(u64));
+
+       return fdt_size;
+}
+
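
To make the estimate concrete, with illustrative numbers only: for a kdump
load on a machine with 64 GiB of RAM, 256 MiB LMBs and a 512 MiB crashk_res
region,

	usm_entries = (64 GiB / 256 MiB) + 2 * (512 MiB / 256 MiB)
		    = 256 + 4 = 260
	fdt_size   += 260 * sizeof(u64) = 2080 bytes

on top of fdt_totalsize(initial_boot_params) + 2 * COMMAND_LINE_SIZE,
replacing the fixed "fdt_totalsize() * 2" guess that elf64_load() used before.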
 /**
  * setup_new_fdt_ppc64 - Update the flattened device-tree of the kernel
  *                       being loaded.
index 549591d..e456446 100644 (file)
@@ -54,6 +54,7 @@ config KVM_BOOK3S_32
        select KVM
        select KVM_BOOK3S_32_HANDLER
        select KVM_BOOK3S_PR_POSSIBLE
+       select PPC_FPU
        help
          Support running unmodified book3s_32 guest kernels
          in virtual machines on book3s_32 host processors.
index 38ea396..c77f2d4 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/cputable.h>
 #include <asm/pte-walk.h>
 
+#include "book3s.h"
 #include "trace_hv.h"
 
 //#define DEBUG_RESIZE_HPT     1
index b08cc15..fdb57be 100644 (file)
 #define SPRN_GQR6              918
 #define SPRN_GQR7              919
 
-/* Book3S_32 defines mfsrin(v) - but that messes up our abstract
- * function pointers, so let's just disable the define. */
-#undef mfsrin
-
 enum priv_level {
        PRIV_PROBLEM = 0,
        PRIV_SUPER = 1,
index f09708d..13bad6b 100644 (file)
@@ -53,6 +53,7 @@
 #include <asm/cputable.h>
 #include <asm/cacheflush.h>
 #include <linux/uaccess.h>
+#include <asm/interrupt.h>
 #include <asm/io.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -3408,8 +3409,9 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
        kvmppc_set_host_core(pcpu);
 
+       guest_exit_irqoff();
+
        local_irq_enable();
-       guest_exit();
 
        /* Let secondaries go back to the offline loop */
        for (i = 0; i < controlled_threads; ++i) {
@@ -4236,8 +4238,9 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 
        kvmppc_set_host_core(pcpu);
 
+       guest_exit_irqoff();
+
        local_irq_enable();
-       guest_exit();
 
        cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
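
The two kvmppc hunks above reorder the guest-exit notification relative to
local_irq_enable(). The intent is that context tracking leaves guest mode
while interrupts are still hard-disabled, so an IRQ or tick that fires the
moment interrupts come back on is accounted to the host rather than to guest
time. guest_exit_irqoff() is the generic context-tracking API; the lines
around it below are an illustrative sketch of the ordering, not the KVM code:

	local_irq_disable();
	/* ... enter and run the guest, return to host context ... */

	guest_exit_irqoff();	/* leave guest context, IRQs still off */
	local_irq_enable();	/* from here, interrupts are host time */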
 
index f3d3183..158d309 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <asm/asm-prototypes.h>
 #include <asm/cputable.h>
+#include <asm/interrupt.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/archrandom.h>
index 30dfeac..e7219b6 100644 (file)
@@ -1813,9 +1813,9 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
                return -EINVAL;
 
        if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
-               state->asserted = 1;
+               state->asserted = true;
        else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
-               state->asserted = 0;
+               state->asserted = false;
                return 0;
        }
 
index f38ae3e..7d5fe43 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <asm/cputable.h>
 #include <linux/uaccess.h>
+#include <asm/interrupt.h>
 #include <asm/kvm_ppc.h>
 #include <asm/cacheflush.h>
 #include <asm/dbell.h>
index 6c083a9..a2a68a9 100644 (file)
@@ -1522,7 +1522,7 @@ int kvmppc_handle_vmx_load(struct kvm_vcpu *vcpu,
        return emulated;
 }
 
-int kvmppc_get_vmx_dword(struct kvm_vcpu *vcpu, int index, u64 *val)
+static int kvmppc_get_vmx_dword(struct kvm_vcpu *vcpu, int index, u64 *val)
 {
        union kvmppc_one_reg reg;
        int vmx_offset = 0;
@@ -1540,7 +1540,7 @@ int kvmppc_get_vmx_dword(struct kvm_vcpu *vcpu, int index, u64 *val)
        return result;
 }
 
-int kvmppc_get_vmx_word(struct kvm_vcpu *vcpu, int index, u64 *val)
+static int kvmppc_get_vmx_word(struct kvm_vcpu *vcpu, int index, u64 *val)
 {
        union kvmppc_one_reg reg;
        int vmx_offset = 0;
@@ -1558,7 +1558,7 @@ int kvmppc_get_vmx_word(struct kvm_vcpu *vcpu, int index, u64 *val)
        return result;
 }
 
-int kvmppc_get_vmx_hword(struct kvm_vcpu *vcpu, int index, u64 *val)
+static int kvmppc_get_vmx_hword(struct kvm_vcpu *vcpu, int index, u64 *val)
 {
        union kvmppc_one_reg reg;
        int vmx_offset = 0;
@@ -1576,7 +1576,7 @@ int kvmppc_get_vmx_hword(struct kvm_vcpu *vcpu, int index, u64 *val)
        return result;
 }
 
-int kvmppc_get_vmx_byte(struct kvm_vcpu *vcpu, int index, u64 *val)
+static int kvmppc_get_vmx_byte(struct kvm_vcpu *vcpu, int index, u64 *val)
 {
        union kvmppc_one_reg reg;
        int vmx_offset = 0;
index 1550e0d..eb2919d 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/string.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
+#include <linux/libnvdimm.h>
 
 #include <asm/cacheflush.h>
 
index ede093e..bb5c20d 100644 (file)
@@ -1306,9 +1306,11 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                if ((word & 0xfe2) == 2)
                        op->type = SYSCALL;
                else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) &&
-                               (word & 0xfe3) == 1)
+                               (word & 0xfe3) == 1) {  /* scv */
                        op->type = SYSCALL_VECTORED_0;
-               else
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
+               } else
                        op->type = UNKNOWN;
                return 0;
 #endif
@@ -1412,7 +1414,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 #ifdef __powerpc64__
        case 1:
                if (!cpu_has_feature(CPU_FTR_ARCH_31))
-                       return -1;
+                       goto unknown_opcode;
 
                prefix_r = GET_PREFIX_R(word);
                ra = GET_PREFIX_RA(suffix);
@@ -1445,8 +1447,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
 #ifdef __powerpc64__
        case 4:
+               /*
+                * There are very many instructions with this primary opcode
+                * introduced in the ISA as early as v2.03. However, the ones
+                * we currently emulate were all introduced with ISA 3.0
+                */
                if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                       return -1;
+                       goto unknown_opcode;
 
                switch (word & 0x3f) {
                case 48:        /* maddhd */
@@ -1472,7 +1479,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                 * There are other instructions from ISA 3.0 with the same
                 * primary opcode which do not have emulation support yet.
                 */
-               return -1;
+               goto unknown_opcode;
 #endif
 
        case 7:         /* mulli */
@@ -1532,6 +1539,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
        case 19:
                if (((word >> 1) & 0x1f) == 2) {
                        /* addpcis */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        imm = (short) (word & 0xffc1);  /* d0 + d2 fields */
                        imm |= (word >> 15) & 0x3e;     /* d1 field */
                        op->val = regs->nip + (imm << 16) + 4;
@@ -1844,7 +1853,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 #ifdef __powerpc64__
                case 265:       /* modud */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        op->val = regs->gpr[ra] % regs->gpr[rb];
                        goto compute_done;
 #endif
@@ -1854,7 +1863,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
                case 267:       /* moduw */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        op->val = (unsigned int) regs->gpr[ra] %
                                (unsigned int) regs->gpr[rb];
                        goto compute_done;
@@ -1891,7 +1900,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 #endif
                case 755:       /* darn */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        switch (ra & 0x3) {
                        case 0:
                                /* 32-bit conditioned */
@@ -1909,18 +1918,18 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                                goto compute_done;
                        }
 
-                       return -1;
+                       goto unknown_opcode;
 #ifdef __powerpc64__
                case 777:       /* modsd */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        op->val = (long int) regs->gpr[ra] %
                                (long int) regs->gpr[rb];
                        goto compute_done;
 #endif
                case 779:       /* modsw */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        op->val = (int) regs->gpr[ra] %
                                (int) regs->gpr[rb];
                        goto compute_done;
@@ -1997,14 +2006,14 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 #endif
                case 538:       /* cnttzw */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        val = (unsigned int) regs->gpr[rd];
                        op->val = (val ? __builtin_ctz(val) : 32);
                        goto logical_done;
 #ifdef __powerpc64__
                case 570:       /* cnttzd */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        val = regs->gpr[rd];
                        op->val = (val ? __builtin_ctzl(val) : 64);
                        goto logical_done;
@@ -2114,7 +2123,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                case 890:       /* extswsli with sh_5 = 0 */
                case 891:       /* extswsli with sh_5 = 1 */
                        if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                               return -1;
+                               goto unknown_opcode;
                        op->type = COMPUTE + SETREG;
                        sh = rb | ((word & 2) << 4);
                        val = (signed int) regs->gpr[rd];
@@ -2441,6 +2450,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 268:       /* lxvx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(LOAD_VSX, 0, 16);
                        op->element_size = 16;
@@ -2450,6 +2461,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                case 269:       /* lxvl */
                case 301: {     /* lxvll */
                        int nb;
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->ea = ra ? regs->gpr[ra] : 0;
                        nb = regs->gpr[rb] & 0xff;
@@ -2470,13 +2483,15 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
                case 333:       /* lxvpx */
                        if (!cpu_has_feature(CPU_FTR_ARCH_31))
-                               return -1;
+                               goto unknown_opcode;
                        op->reg = VSX_REGISTER_XTP(rd);
                        op->type = MKOP(LOAD_VSX, 0, 32);
                        op->element_size = 32;
                        break;
 
                case 364:       /* lxvwsx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(LOAD_VSX, 0, 4);
                        op->element_size = 4;
@@ -2484,6 +2499,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 396:       /* stxvx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(STORE_VSX, 0, 16);
                        op->element_size = 16;
@@ -2493,6 +2510,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                case 397:       /* stxvl */
                case 429: {     /* stxvll */
                        int nb;
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->ea = ra ? regs->gpr[ra] : 0;
                        nb = regs->gpr[rb] & 0xff;
@@ -2506,7 +2525,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                }
                case 461:       /* stxvpx */
                        if (!cpu_has_feature(CPU_FTR_ARCH_31))
-                               return -1;
+                               goto unknown_opcode;
                        op->reg = VSX_REGISTER_XTP(rd);
                        op->type = MKOP(STORE_VSX, 0, 32);
                        op->element_size = 32;
@@ -2544,6 +2563,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 781:       /* lxsibzx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(LOAD_VSX, 0, 1);
                        op->element_size = 8;
@@ -2551,6 +2572,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 812:       /* lxvh8x */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(LOAD_VSX, 0, 16);
                        op->element_size = 2;
@@ -2558,6 +2581,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 813:       /* lxsihzx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(LOAD_VSX, 0, 2);
                        op->element_size = 8;
@@ -2571,6 +2596,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 876:       /* lxvb16x */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(LOAD_VSX, 0, 16);
                        op->element_size = 1;
@@ -2584,6 +2611,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 909:       /* stxsibx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(STORE_VSX, 0, 1);
                        op->element_size = 8;
@@ -2591,6 +2620,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 940:       /* stxvh8x */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(STORE_VSX, 0, 16);
                        op->element_size = 2;
@@ -2598,6 +2629,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 941:       /* stxsihx */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(STORE_VSX, 0, 2);
                        op->element_size = 8;
@@ -2611,6 +2644,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 1004:      /* stxvb16x */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd | ((word & 1) << 5);
                        op->type = MKOP(STORE_VSX, 0, 16);
                        op->element_size = 1;
@@ -2719,12 +2754,16 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        op->type = MKOP(LOAD_FP, 0, 16);
                        break;
                case 2:         /* lxsd */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd + 32;
                        op->type = MKOP(LOAD_VSX, 0, 8);
                        op->element_size = 8;
                        op->vsx_flags = VSX_CHECK_VEC;
                        break;
                case 3:         /* lxssp */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->reg = rd + 32;
                        op->type = MKOP(LOAD_VSX, 0, 4);
                        op->element_size = 8;
@@ -2754,7 +2793,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 #ifdef CONFIG_VSX
        case 6:
                if (!cpu_has_feature(CPU_FTR_ARCH_31))
-                       return -1;
+                       goto unknown_opcode;
                op->ea = dqform_ea(word, regs);
                op->reg = VSX_REGISTER_XTP(rd);
                op->element_size = 32;
@@ -2777,6 +2816,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 1:         /* lxv */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->ea = dqform_ea(word, regs);
                        if (word & 8)
                                op->reg = rd + 32;
@@ -2787,6 +2828,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
                case 2:         /* stxsd with LSB of DS field = 0 */
                case 6:         /* stxsd with LSB of DS field = 1 */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->ea = dsform_ea(word, regs);
                        op->reg = rd + 32;
                        op->type = MKOP(STORE_VSX, 0, 8);
@@ -2796,6 +2839,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
                case 3:         /* stxssp with LSB of DS field = 0 */
                case 7:         /* stxssp with LSB of DS field = 1 */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->ea = dsform_ea(word, regs);
                        op->reg = rd + 32;
                        op->type = MKOP(STORE_VSX, 0, 4);
@@ -2804,6 +2849,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        break;
 
                case 5:         /* stxv */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               goto unknown_opcode;
                        op->ea = dqform_ea(word, regs);
                        if (word & 8)
                                op->reg = rd + 32;
@@ -2833,7 +2880,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                break;
        case 1: /* Prefixed instructions */
                if (!cpu_has_feature(CPU_FTR_ARCH_31))
-                       return -1;
+                       goto unknown_opcode;
 
                prefix_r = GET_PREFIX_R(word);
                ra = GET_PREFIX_RA(suffix);
@@ -2972,6 +3019,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
        }
 
+       if (OP_IS_LOAD_STORE(op->type) && (op->type & UPDATE)) {
+               switch (GETTYPE(op->type)) {
+               case LOAD:
+                       if (ra == rd)
+                               goto unknown_opcode;
+                       fallthrough;
+               case STORE:
+               case LOAD_FP:
+               case STORE_FP:
+                       if (ra == 0)
+                               goto unknown_opcode;
+               }
+       }
+
 #ifdef CONFIG_VSX
        if ((GETTYPE(op->type) == LOAD_VSX ||
             GETTYPE(op->type) == STORE_VSX) &&
@@ -2982,6 +3043,10 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
        return 0;
 
+ unknown_opcode:
+       op->type = UNKNOWN;
+       return 0;
+
  logical_done:
        if (word & 1)
                set_cr0(regs, op);
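
The common theme of the sstep.c hunks: where analyse_instr() used to return -1
for instructions gated on a missing CPU feature, it now decodes them, sets
op->type = UNKNOWN and returns 0, so callers raise an illegal-instruction
exception instead of falling back to executing something this CPU may not
implement. The final hunk applies the same treatment to invalid update-form
loads and stores (ra == 0, or ra == rd for a load). A hypothetical caller-side
sketch of the convention (invented dispatch, not the kernel's emulate_step()):

static int try_emulate(struct pt_regs *regs, struct ppc_inst instr)
{
	struct instruction_op op;
	int r = analyse_instr(&op, regs, instr);

	if (r < 0)
		return -1;	/* could not analyse: caller falls back */
	if (r == 0 && GETTYPE(op.type) == UNKNOWN)
		return 0;	/* decoded, but illegal here: raise SIGILL */
	/* ... otherwise emulate or perform the decoded operation ... */
	return 1;
}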
index 3f972db..446d9de 100644 (file)
@@ -6,4 +6,6 @@ ifdef CONFIG_KASAN
 CFLAGS_mmu.o           += -DDISABLE_BRANCH_PROFILING
 endif
 
-obj-y += mmu.o hash_low.o mmu_context.o tlb.o nohash_low.o
+obj-y += mmu.o mmu_context.o
+obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o
+obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o
index 859e5bd..d7eb266 100644 (file)
@@ -234,7 +234,7 @@ void mmu_mark_initmem_nx(void)
                if (is_module_segment(i << 28))
                        continue;
 
-               mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
+               mtsr(mfsr(i << 28) | 0x10000000, i << 28);
        }
 }
 
index b5e9fff..a688e13 100644 (file)
 unsigned int hpage_shift;
 EXPORT_SYMBOL(hpage_shift);
 
-extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
-                                 unsigned long pa, unsigned long rlags,
-                                 unsigned long vflags, int psize, int ssize);
-
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                     pte_t *ptep, unsigned long trap, unsigned long flags,
                     int ssize, unsigned int shift, unsigned int mmu_psize)
index 73b06ad..581b20a 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/pgtable.h>
 
 #include <asm/debugfs.h>
+#include <asm/interrupt.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
@@ -1143,10 +1144,10 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
        page = pte_page(pte);
 
        /* page is dirty */
-       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+       if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        flush_dcache_icache_page(page);
-                       set_bit(PG_arch_1, &page->flags);
+                       set_bit(PG_dcache_clean, &page->flags);
                } else
                        pp |= HPTE_R_N;
        }
@@ -1288,7 +1289,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
                 unsigned long flags)
 {
        bool is_thp;
-       enum ctx_state prev_state = exception_enter();
        pgd_t *pgdir;
        unsigned long vsid;
        pte_t *ptep;
@@ -1490,7 +1490,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
        DBG_LOW(" -> rc=%d\n", rc);
 
 bail:
-       exception_exit(prev_state);
        return rc;
 }
 EXPORT_SYMBOL_GPL(hash_page_mm);
@@ -1512,16 +1511,22 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
 }
 EXPORT_SYMBOL_GPL(hash_page);
 
-int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
-               unsigned long msr)
+DECLARE_INTERRUPT_HANDLER_RET(__do_hash_fault);
+DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
 {
+       unsigned long ea = regs->dar;
+       unsigned long dsisr = regs->dsisr;
        unsigned long access = _PAGE_PRESENT | _PAGE_READ;
        unsigned long flags = 0;
-       struct mm_struct *mm = current->mm;
-       unsigned int region_id = get_region_id(ea);
+       struct mm_struct *mm;
+       unsigned int region_id;
+       long err;
 
+       region_id = get_region_id(ea);
        if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
                mm = &init_mm;
+       else
+               mm = current->mm;
 
        if (dsisr & DSISR_NOHPTE)
                flags |= HPTE_NOHPTE_UPDATE;
@@ -1537,13 +1542,66 @@ int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
         * 2) user space access kernel space.
         */
        access |= _PAGE_PRIVILEGED;
-       if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
+       if (user_mode(regs) || (region_id == USER_REGION_ID))
                access &= ~_PAGE_PRIVILEGED;
 
-       if (trap == 0x400)
+       if (regs->trap == 0x400)
                access |= _PAGE_EXEC;
 
-       return hash_page_mm(mm, ea, access, trap, flags);
+       err = hash_page_mm(mm, ea, access, regs->trap, flags);
+       if (unlikely(err < 0)) {
+               // failed to insert a hash PTE due to a hypervisor error
+               if (user_mode(regs)) {
+                       if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
+                               _exception(SIGSEGV, regs, SEGV_ACCERR, ea);
+                       else
+                               _exception(SIGBUS, regs, BUS_ADRERR, ea);
+               } else {
+                       bad_page_fault(regs, SIGBUS);
+               }
+               err = 0;
+       }
+
+       return err;
+}
+
+/*
+ * The _RAW interrupt entry checks for the in_nmi() case before
+ * running the full handler.
+ */
+DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
+{
+       unsigned long dsisr = regs->dsisr;
+       long err;
+
+       if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT)))
+               goto page_fault;
+
+       /*
+        * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
+        * don't call hash_page, just fail the fault. This is required to
+        * prevent re-entrancy problems in the hash code, namely perf
+        * interrupts hitting while something holds H_PAGE_BUSY, and taking a
+        * hash fault. See the comment in hash_preload().
+        *
+        * We come here as a result of a DSI at a point where we don't want
+        * to call hash_page, such as when we are accessing memory (possibly
+        * user memory) inside a PMU interrupt that occurred while interrupts
+        * were soft-disabled.  We want to invoke the exception handler for
+        * the access, or panic if there isn't a handler.
+        */
+       if (unlikely(in_nmi())) {
+               do_bad_page_fault_segv(regs);
+               return 0;
+       }
+
+       err = __do_hash_fault(regs);
+       if (err) {
+page_fault:
+               err = hash__do_page_fault(regs);
+       }
+
+       return err;
 }
 
 #ifdef CONFIG_PPC_MM_SLICES
@@ -1843,27 +1901,6 @@ void flush_hash_range(unsigned long number, int local)
        }
 }
 
-/*
- * low_hash_fault is called when we the low level hash code failed
- * to instert a PTE due to an hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
-       enum ctx_state prev_state = exception_enter();
-
-       if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-               if (rc == -2)
-                       _exception(SIGSEGV, regs, SEGV_ACCERR, address);
-               else
-#endif
-                       _exception(SIGBUS, regs, BUS_ADRERR, address);
-       } else
-               bad_page_fault(regs, address, SIGBUS);
-
-       exception_exit(prev_state);
-}
-
 long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
                           unsigned long pa, unsigned long rflags,
                           unsigned long vflags, int psize, int ssize)
index c12d78e..5045048 100644 (file)
@@ -15,4 +15,6 @@ static inline bool stress_slb(void)
 
 void slb_setup_new_exec(void);
 
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);
+
 #endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */
index 685d7bb..cd18e94 100644 (file)
@@ -129,7 +129,8 @@ good_exit:
 
        mutex_lock(&mem_list_mutex);
 
-       list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) {
+       list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next,
+                               lockdep_is_held(&mem_list_mutex)) {
                /* Overlap? */
                if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) &&
                                (ua < (mem2->ua +
@@ -289,6 +290,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 {
        struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
 
+       rcu_read_lock();
        list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
                if ((mem->ua <= ua) &&
                                (ua + size <= mem->ua +
@@ -297,6 +299,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
                        break;
                }
        }
+       rcu_read_unlock();
 
        return ret;
 }
@@ -327,7 +330,8 @@ struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
 
        mutex_lock(&mem_list_mutex);
 
-       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next,
+                               lockdep_is_held(&mem_list_mutex)) {
                if ((mem->ua == ua) && (mem->entries == entries)) {
                        ret = mem;
                        ++mem->used;
@@ -421,6 +425,7 @@ bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
        struct mm_iommu_table_group_mem_t *mem;
        unsigned long end;
 
+       rcu_read_lock();
        list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
                if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
                        continue;
@@ -437,6 +442,7 @@ bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
                        return true;
                }
        }
+       rcu_read_unlock();
 
        return false;
 }
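
The iommu_api.c hunks tighten the standard RCU-list discipline: lockless
readers must run inside rcu_read_lock()/rcu_read_unlock(), while update-side
traversals that hold mem_list_mutex pass lockdep_is_held() so lockdep can
verify the protection rather than warn about the bare traversal. The same
pattern in a generic, self-contained sketch (not the powerpc code):

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct foo {
	struct list_head next;
	int key;
};

static LIST_HEAD(foo_list);
static DEFINE_MUTEX(foo_mutex);

/* Reader: RCU read-side critical section, no lock needed */
static struct foo *foo_lookup(int key)
{
	struct foo *f, *ret = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(f, &foo_list, next) {
		if (f->key == key) {
			ret = f;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

/* Updater: holds foo_mutex, tells lockdep the list is stable */
static bool foo_exists_locked(int key)
{
	struct foo *f;

	lockdep_assert_held(&foo_mutex);
	list_for_each_entry_rcu(f, &foo_list, next,
				lockdep_is_held(&foo_mutex)) {
		if (f->key == key)
			return true;
	}
	return false;
}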
index 5b3a3ba..9ffa650 100644 (file)
@@ -20,6 +20,8 @@
 #include <mm/mmu_decl.h>
 #include <trace/events/thp.h>
 
+#include "internal.h"
+
 unsigned long __pmd_frag_nr;
 EXPORT_SYMBOL(__pmd_frag_nr);
 unsigned long __pmd_frag_size_shift;
@@ -79,10 +81,15 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
 
-static void do_nothing(void *unused)
+static void do_serialize(void *arg)
 {
-
+       /* We've taken the IPI, so try to trim the mask while here */
+       if (radix_enabled()) {
+               struct mm_struct *mm = arg;
+               exit_lazy_flush_tlb(mm, false);
+       }
 }
+
 /*
  * Serialize against find_current_mm_pte which does lock-less
  * lookup in page tables with local interrupts disabled. For huge pages
@@ -96,7 +103,7 @@ static void do_nothing(void *unused)
 void serialize_against_pte_lookup(struct mm_struct *mm)
 {
        smp_mb();
-       smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+       smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1);
 }
 
 /*
index fb66d15..409e612 100644 (file)
@@ -18,6 +18,8 @@
 #include <asm/cputhreads.h>
 #include <asm/plpar_wrappers.h>
 
+#include "internal.h"
+
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
 #define RIC_FLUSH_ALL 2
@@ -627,15 +629,6 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_page);
 
-static bool mm_is_singlethreaded(struct mm_struct *mm)
-{
-       if (atomic_read(&mm->context.copros) > 0)
-               return false;
-       if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
-               return true;
-       return false;
-}
-
 static bool mm_needs_flush_escalation(struct mm_struct *mm)
 {
        /*
@@ -648,21 +641,24 @@ static bool mm_needs_flush_escalation(struct mm_struct *mm)
        return false;
 }
 
-#ifdef CONFIG_SMP
-static void do_exit_flush_lazy_tlb(void *arg)
+/*
+ * If always_flush is true, then flush even if this CPU can't be removed
+ * from mm_cpumask.
+ */
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
 {
-       struct mm_struct *mm = arg;
        unsigned long pid = mm->context.id;
+       int cpu = smp_processor_id();
 
        /*
         * A kthread could have done a mmget_not_zero() after the flushing CPU
-        * checked mm_is_singlethreaded, and be in the process of
-        * kthread_use_mm when interrupted here. In that case, current->mm will
-        * be set to mm, because kthread_use_mm() setting ->mm and switching to
-        * the mm is done with interrupts off.
+        * checked mm_cpumask, and be in the process of kthread_use_mm when
+        * interrupted here. In that case, current->mm will be set to mm,
+        * because kthread_use_mm() setting ->mm and switching to the mm is
+        * done with interrupts off.
         */
        if (current->mm == mm)
-               goto out_flush;
+               goto out;
 
        if (current->active_mm == mm) {
                WARN_ON_ONCE(current->mm != NULL);
@@ -673,11 +669,30 @@ static void do_exit_flush_lazy_tlb(void *arg)
                mmdrop(mm);
        }
 
-       atomic_dec(&mm->context.active_cpus);
-       cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
+       /*
+        * This IPI may be initiated from any source including those not
+        * running the mm, so there may be a racing IPI that comes after
+        * this one which finds the cpumask already clear. Check and avoid
+        * underflowing the active_cpus count in that case. The race should
+        * not otherwise be a problem, but the TLB must be flushed because
+        * that's what the caller expects.
+        */
+       if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+               atomic_dec(&mm->context.active_cpus);
+               cpumask_clear_cpu(cpu, mm_cpumask(mm));
+               always_flush = true;
+       }
 
-out_flush:
-       _tlbiel_pid(pid, RIC_FLUSH_ALL);
+out:
+       if (always_flush)
+               _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+       struct mm_struct *mm = arg;
+       exit_lazy_flush_tlb(mm, true);
 }
 
 static void exit_flush_lazy_tlbs(struct mm_struct *mm)
@@ -693,9 +708,110 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
                                (void *)mm, 1);
 }
 
+#else /* CONFIG_SMP */
+static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
+#endif /* CONFIG_SMP */
+
+static DEFINE_PER_CPU(unsigned int, mm_cpumask_trim_clock);
+
+/*
+ * Interval between flushes at which we send out IPIs to check whether the
+ * mm_cpumask can be trimmed for the case where it's not a single-threaded
+ * process flushing its own mm. The intent is to reduce the cost of later
+ * flushes. Don't want this to be so low that it adds noticeable cost to TLB
+ * flushing, or so high that it doesn't help reduce global TLBIEs.
+ */
+static unsigned long tlb_mm_cpumask_trim_timer = 1073;
+
+static bool tick_and_test_trim_clock(void)
+{
+       if (__this_cpu_inc_return(mm_cpumask_trim_clock) ==
+                       tlb_mm_cpumask_trim_timer) {
+               __this_cpu_write(mm_cpumask_trim_clock, 0);
+               return true;
+       }
+       return false;
+}
+
+enum tlb_flush_type {
+       FLUSH_TYPE_NONE,
+       FLUSH_TYPE_LOCAL,
+       FLUSH_TYPE_GLOBAL,
+};
+
+static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm)
+{
+       int active_cpus = atomic_read(&mm->context.active_cpus);
+       int cpu = smp_processor_id();
+
+       if (active_cpus == 0)
+               return FLUSH_TYPE_NONE;
+       if (active_cpus == 1 && cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+               if (current->mm != mm) {
+                       /*
+                        * Asynchronous flush sources may trim down to nothing
+                        * if the process is not running, so occasionally try
+                        * to trim.
+                        */
+                       if (tick_and_test_trim_clock()) {
+                               exit_lazy_flush_tlb(mm, true);
+                               return FLUSH_TYPE_NONE;
+                       }
+               }
+               return FLUSH_TYPE_LOCAL;
+       }
+
+       /* Coprocessors require TLBIE to invalidate nMMU. */
+       if (atomic_read(&mm->context.copros) > 0)
+               return FLUSH_TYPE_GLOBAL;
+
+       /*
+        * In the fullmm case there's no point doing the exit_flush_lazy_tlbs
+        * because the mm is being taken down anyway, and a TLBIE tends to
+        * be faster than an IPI+TLBIEL.
+        */
+       if (fullmm)
+               return FLUSH_TYPE_GLOBAL;
+
+       /*
+        * If we are running the only thread of a single-threaded process,
+        * then we should almost always be able to trim off the rest of the
+        * CPU mask (except in the case of use_mm() races), so always try
+        * trimming the mask.
+        */
+       if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) {
+               exit_flush_lazy_tlbs(mm);
+               /*
+                * use_mm() race could prevent IPIs from being able to clear
+                * the cpumask here, however those users are established
+                * after our first check (and so after the PTEs are removed),
+                * and the TLB still gets flushed by the IPI, so this CPU
+                * will only require a local flush.
+                */
+               return FLUSH_TYPE_LOCAL;
+       }
+
+       /*
+        * Occasionally try to trim down the cpumask. It's possible this can
+        * bring the mask to zero, which results in no flush.
+        */
+       if (tick_and_test_trim_clock()) {
+               exit_flush_lazy_tlbs(mm);
+               if (current->mm == mm)
+                       return FLUSH_TYPE_LOCAL;
+               if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+                       exit_lazy_flush_tlb(mm, true);
+               return FLUSH_TYPE_NONE;
+       }
+
+       return FLUSH_TYPE_GLOBAL;
+}
+
+#ifdef CONFIG_SMP
 void radix__flush_tlb_mm(struct mm_struct *mm)
 {
        unsigned long pid;
+       enum tlb_flush_type type;
 
        pid = mm->context.id;
        if (unlikely(pid == MMU_NO_CONTEXT))
@@ -703,16 +819,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 
        preempt_disable();
        /*
-        * Order loads of mm_cpumask vs previous stores to clear ptes before
-        * the invalidate. See barrier in switch_mm_irqs_off
+        * Order loads of mm_cpumask (in flush_type_needed) vs previous
+        * stores to clear ptes before the invalidate. See barrier in
+        * switch_mm_irqs_off
         */
        smp_mb();
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       exit_flush_lazy_tlbs(mm);
-                       goto local;
-               }
-
+       type = flush_type_needed(mm, false);
+       if (type == FLUSH_TYPE_LOCAL) {
+               _tlbiel_pid(pid, RIC_FLUSH_TLB);
+       } else if (type == FLUSH_TYPE_GLOBAL) {
                if (!mmu_has_feature(MMU_FTR_GTSE)) {
                        unsigned long tgt = H_RPTI_TARGET_CMMU;
 
@@ -728,9 +843,6 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
                } else {
                        _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
                }
-       } else {
-local:
-               _tlbiel_pid(pid, RIC_FLUSH_TLB);
        }
        preempt_enable();
 }
@@ -739,6 +851,7 @@ EXPORT_SYMBOL(radix__flush_tlb_mm);
 static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 {
        unsigned long pid;
+       enum tlb_flush_type type;
 
        pid = mm->context.id;
        if (unlikely(pid == MMU_NO_CONTEXT))
@@ -746,13 +859,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 
        preempt_disable();
        smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       if (!fullmm) {
-                               exit_flush_lazy_tlbs(mm);
-                               goto local;
-                       }
-               }
+       type = flush_type_needed(mm, fullmm);
+       if (type == FLUSH_TYPE_LOCAL) {
+               _tlbiel_pid(pid, RIC_FLUSH_ALL);
+       } else if (type == FLUSH_TYPE_GLOBAL) {
                if (!mmu_has_feature(MMU_FTR_GTSE)) {
                        unsigned long tgt = H_RPTI_TARGET_CMMU;
                        unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
@@ -766,9 +876,6 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
                        _tlbie_pid(pid, RIC_FLUSH_ALL);
                else
                        _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
-       } else {
-local:
-               _tlbiel_pid(pid, RIC_FLUSH_ALL);
        }
        preempt_enable();
 }
@@ -783,6 +890,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
                                 int psize)
 {
        unsigned long pid;
+       enum tlb_flush_type type;
 
        pid = mm->context.id;
        if (unlikely(pid == MMU_NO_CONTEXT))
@@ -790,11 +898,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 
        preempt_disable();
        smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       exit_flush_lazy_tlbs(mm);
-                       goto local;
-               }
+       type = flush_type_needed(mm, false);
+       if (type == FLUSH_TYPE_LOCAL) {
+               _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+       } else if (type == FLUSH_TYPE_GLOBAL) {
                if (!mmu_has_feature(MMU_FTR_GTSE)) {
                        unsigned long tgt, pg_sizes, size;
 
@@ -811,9 +918,6 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
                        _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
                else
                        _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
-       } else {
-local:
-               _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
        }
        preempt_enable();
 }
@@ -828,8 +932,6 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 }
 EXPORT_SYMBOL(radix__flush_tlb_page);
 
-#else /* CONFIG_SMP */
-static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
 #endif /* CONFIG_SMP */
 
 static void do_tlbiel_kernel(void *info)
@@ -893,7 +995,9 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
        unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
        unsigned long page_size = 1UL << page_shift;
        unsigned long nr_pages = (end - start) >> page_shift;
-       bool local, full;
+       bool fullmm = (end == TLB_FLUSH_ALL);
+       bool flush_pid;
+       enum tlb_flush_type type;
 
        pid = mm->context.id;
        if (unlikely(pid == MMU_NO_CONTEXT))
@@ -901,24 +1005,18 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 
        preempt_disable();
        smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       if (end != TLB_FLUSH_ALL) {
-                               exit_flush_lazy_tlbs(mm);
-                               goto is_local;
-                       }
-               }
-               local = false;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_single_page_flush_ceiling);
-       } else {
-is_local:
-               local = true;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_local_single_page_flush_ceiling);
-       }
+       type = flush_type_needed(mm, fullmm);
+       if (type == FLUSH_TYPE_NONE)
+               goto out;
+
+       if (fullmm)
+               flush_pid = true;
+       else if (type == FLUSH_TYPE_GLOBAL)
+               flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+       else
+               flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
 
-       if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+       if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
                unsigned long tgt = H_RPTI_TARGET_CMMU;
                unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
 
@@ -928,8 +1026,8 @@ is_local:
                        tgt |= H_RPTI_TARGET_NMMU;
                pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, pg_sizes,
                                       start, end);
-       } else if (full) {
-               if (local) {
+       } else if (flush_pid) {
+               if (type == FLUSH_TYPE_LOCAL) {
                        _tlbiel_pid(pid, RIC_FLUSH_TLB);
                } else {
                        if (cputlb_use_tlbie()) {
@@ -952,7 +1050,7 @@ is_local:
                                hflush = true;
                }
 
-               if (local) {
+               if (type == FLUSH_TYPE_LOCAL) {
                        asm volatile("ptesync": : :"memory");
                        __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
                        if (hflush)
@@ -974,6 +1072,7 @@ is_local:
                                        hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
                }
        }
+out:
        preempt_enable();
 }
 
@@ -1085,32 +1184,30 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
        unsigned int page_shift = mmu_psize_defs[psize].shift;
        unsigned long page_size = 1UL << page_shift;
        unsigned long nr_pages = (end - start) >> page_shift;
-       bool local, full;
+       bool fullmm = (end == TLB_FLUSH_ALL);
+       bool flush_pid;
+       enum tlb_flush_type type;
 
        pid = mm->context.id;
        if (unlikely(pid == MMU_NO_CONTEXT))
                return;
 
        preempt_disable();
        smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       if (end != TLB_FLUSH_ALL) {
-                               exit_flush_lazy_tlbs(mm);
-                               goto is_local;
-                       }
-               }
-               local = false;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_single_page_flush_ceiling);
-       } else {
-is_local:
-               local = true;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_local_single_page_flush_ceiling);
-       }
+       type = flush_type_needed(mm, fullmm);
+       if (type == FLUSH_TYPE_NONE)
+               goto out;
+
+       if (fullmm)
+               flush_pid = true;
+       else if (type == FLUSH_TYPE_GLOBAL)
+               flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+       else
+               flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
 
-       if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+       if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
                unsigned long tgt = H_RPTI_TARGET_CMMU;
                unsigned long type = H_RPTI_TYPE_TLB;
                unsigned long pg_sizes = psize_to_rpti_pgsize(psize);
@@ -1120,8 +1217,8 @@ is_local:
                if (atomic_read(&mm->context.copros) > 0)
                        tgt |= H_RPTI_TARGET_NMMU;
                pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
-       } else if (full) {
-               if (local) {
+       } else if (flush_pid) {
+               if (type == FLUSH_TYPE_LOCAL) {
                        _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
                } else {
                        if (cputlb_use_tlbie()) {
@@ -1137,7 +1234,7 @@ is_local:
 
                }
        } else {
-               if (local)
+               if (type == FLUSH_TYPE_LOCAL)
                        _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
                else if (cputlb_use_tlbie())
                        _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
@@ -1145,6 +1242,7 @@ is_local:
                        _tlbiel_va_range_multicast(mm,
                                        start, end, pid, page_size, psize, also_pwc);
        }
+out:
        preempt_enable();
 }
 
@@ -1164,6 +1262,7 @@ static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
        unsigned long pid, end;
+       enum tlb_flush_type type;
 
        pid = mm->context.id;
        if (unlikely(pid == MMU_NO_CONTEXT))
@@ -1180,11 +1279,10 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
        /* Otherwise first do the PWC, then iterate the pages. */
        preempt_disable();
        smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       exit_flush_lazy_tlbs(mm);
-                       goto local;
-               }
+       type = flush_type_needed(mm, false);
+       if (type == FLUSH_TYPE_LOCAL) {
+               _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+       } else if (type == FLUSH_TYPE_GLOBAL) {
                if (!mmu_has_feature(MMU_FTR_GTSE)) {
                        unsigned long tgt, type, pg_sizes;
 
@@ -1202,9 +1300,6 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
                else
                        _tlbiel_va_range_multicast(mm,
                                        addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
-       } else {
-local:
-               _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
        }
 
        preempt_enable();
index 5845679..c91bd85 100644
@@ -10,6 +10,7 @@
  */
 
 #include <asm/asm-prototypes.h>
+#include <asm/interrupt.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/paca.h>
@@ -813,8 +814,9 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
        return slb_insert_entry(ea, context, flags, ssize, false);
 }
 
-long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
 {
+       unsigned long ea = regs->dar;
        unsigned long id = get_region_id(ea);
 
        /* IRQs are not reconciled here, so can't check irqs_disabled */
@@ -824,19 +826,21 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea)
                return -EINVAL;
 
        /*
-        * SLB kernel faults must be very careful not to touch anything
-        * that is not bolted. E.g., PACA and global variables are okay,
-        * mm->context stuff is not.
-        *
-        * SLB user faults can access all of kernel memory, but must be
-        * careful not to touch things like IRQ state because it is not
-        * "reconciled" here. The difficulty is that we must use
-        * fast_exception_return to return from kernel SLB faults without
-        * looking at possible non-bolted memory. We could test user vs
-        * kernel faults in the interrupt handler asm and do a full fault,
-        * reconcile, ret_from_except for user faults which would make them
-        * first class kernel code. But for performance it's probably nicer
-        * if they go via fast_exception_return too.
+        * SLB kernel faults must be very careful not to touch anything that is
+        * not bolted. E.g., PACA and global variables are okay, mm->context
+        * stuff is not. SLB user faults may access all of memory (and induce
+        * one recursive SLB kernel fault), so the kernel fault must not
+        * trample on the user fault state at those points.
+        */
+
+       /*
+        * This is a raw interrupt handler, for performance, so that
+        * fast_interrupt_return can be used. The handler must not touch local
+        * irq state or schedule. We could test for usermode and upgrade to a
+        * normal process context (synchronous) interrupt for those, which
+        * would make them first-class kernel code able to be traced and
+        * instrumented; performance would suffer a bit, but it would probably
+        * be a good tradeoff.
         */
        if (id >= LINEAR_MAP_REGION_ID) {
                long err;
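DEFINE_INTERRUPT_HANDLER_RAW comes from the new asm/interrupt.h wrappers this series introduces: a raw handler gets no irq reconciliation, accounting or tracing wrapped around it, which is exactly the property the comment above depends on. The real macro lives in arch/powerpc/include/asm/interrupt.h; the following is only an illustrative sketch of its shape (attributes and NOKPROBE annotations elided, details may differ):

    /* Illustrative sketch only -- not the kernel's definition. */
    #define DEFINE_INTERRUPT_HANDLER_RAW(func)                         \
    static __always_inline long ____##func(struct pt_regs *regs);     \
                                                                       \
    long func(struct pt_regs *regs)                                    \
    {                                                                  \
            /* raw: no irq-state reconciliation, no tracing hooks */   \
            return ____##func(regs);                                   \
    }                                                                  \
                                                                       \
    static __always_inline long ____##func(struct pt_regs *regs)

The definition site then reads as a normal function body, as in the hunk above, while the wrapper owns whatever entry/exit bookkeeping the handler class requires.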
@@ -865,13 +869,15 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea)
        }
 }
 
-void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
+DEFINE_INTERRUPT_HANDLER(do_bad_slb_fault)
 {
+       int err = regs->result;
+
        if (err == -EFAULT) {
                if (user_mode(regs))
-                       _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
+                       _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
                else
-                       bad_page_fault(regs, ea, SIGSEGV);
+                       bad_page_fault(regs, SIGSEGV);
        } else if (err == -EINVAL) {
                unrecoverable_exception(regs);
        } else {
index 8961b44..bb36825 100644
@@ -34,6 +34,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/firmware.h>
+#include <asm/interrupt.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
@@ -377,18 +378,16 @@ static void sanity_check_fault(bool is_write, bool is_user,
 
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
- * for a data fault, SRR1 for an instruction fault. For 400-family processors
- * the error_code parameter is ESR for a data fault, 0 for an instruction
- * fault.
- * For 64-bit processors, the error_code parameter is
- *  - DSISR for a non-SLB data access fault,
- *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
- *  - 0 any SLB fault.
+ * for a data fault, SRR1 for an instruction fault.
+ * For 400-family processors the error_code parameter is ESR for a data fault,
+ * 0 for an instruction fault.
+ * For 64-bit processors, the error_code parameter is DSISR for a data access
+ * fault, SRR1 & 0x08000000 for an instruction access fault.
  *
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
                           unsigned long error_code)
 {
        struct vm_area_struct * vma;
@@ -435,9 +434,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
                return bad_area_nosemaphore(regs, address);
        }
 
-       /* We restore the interrupt state now */
-       if (!arch_irq_disabled_regs(regs))
-               local_irq_enable();
+       interrupt_cond_local_irq_enable(regs);
 
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
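interrupt_cond_local_irq_enable() takes over the open-coded "restore the interrupt state now" check deleted just above. A presumed sketch of the helper, as one would expect it to look in asm/interrupt.h (an inference from the call site, not a quotation):

    static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs)
    {
            /* re-enable interrupts only if they were enabled when the
             * interrupt was taken */
            if (!arch_irq_disabled_regs(regs))
                    local_irq_enable();
    }

Centralising it means every converted handler restores irq state the same way instead of repeating the two-line idiom.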
 
@@ -540,34 +537,51 @@ retry:
 
        return 0;
 }
-NOKPROBE_SYMBOL(__do_page_fault);
+NOKPROBE_SYMBOL(___do_page_fault);
 
-int do_page_fault(struct pt_regs *regs, unsigned long address,
-                 unsigned long error_code)
+static long __do_page_fault(struct pt_regs *regs)
 {
        const struct exception_table_entry *entry;
-       enum ctx_state prev_state = exception_enter();
-       int rc = __do_page_fault(regs, address, error_code);
-       exception_exit(prev_state);
-       if (likely(!rc))
-               return 0;
+       long err;
+
+       err = ___do_page_fault(regs, regs->dar, regs->dsisr);
+       if (likely(!err))
+               return err;
 
        entry = search_exception_tables(regs->nip);
-       if (unlikely(!entry))
-               return rc;
+       if (likely(entry)) {
+               instruction_pointer_set(regs, extable_fixup(entry));
+               return 0;
+       } else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) {
+               __bad_page_fault(regs, err);
+               return 0;
+       } else {
+               /* 32 and 64e handle the bad page fault in asm */
+               return err;
+       }
+}
+NOKPROBE_SYMBOL(__do_page_fault);
 
-       instruction_pointer_set(regs, extable_fixup(entry));
+DEFINE_INTERRUPT_HANDLER_RET(do_page_fault)
+{
+       return __do_page_fault(regs);
+}
 
-       return 0;
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Same as do_page_fault but interrupt entry has already run in do_hash_fault */
+long hash__do_page_fault(struct pt_regs *regs)
+{
+       return __do_page_fault(regs);
 }
-NOKPROBE_SYMBOL(do_page_fault);
+NOKPROBE_SYMBOL(hash__do_page_fault);
+#endif
 
 /*
  * bad_page_fault is called when we have a bad access from the kernel.
  * It is called from the DSI and ISI handlers in head.S and from some
  * of the procedures in traps.c.
  */
-void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+void __bad_page_fault(struct pt_regs *regs, int sig)
 {
        int is_write = page_fault_is_write(regs->dsisr);
 
@@ -605,7 +619,7 @@ void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
        die("Kernel access of bad area", regs, sig);
 }
 
-void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+void bad_page_fault(struct pt_regs *regs, int sig)
 {
        const struct exception_table_entry *entry;
 
@@ -614,5 +628,12 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
        if (entry)
                instruction_pointer_set(regs, extable_fixup(entry));
        else
-               __bad_page_fault(regs, address, sig);
+               __bad_page_fault(regs, sig);
 }
+
+#ifdef CONFIG_PPC_BOOK3S_64
+DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv)
+{
+       bad_page_fault(regs, SIGSEGV);
+}
+#endif
index 8b3cc4d..d142b76 100644
@@ -217,7 +217,7 @@ void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_p
        }
 }
 
-int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
+static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
 {
        struct huge_bootmem_page *m;
        if (nr_gpages == 0)
@@ -663,24 +663,6 @@ static int __init hugetlbpage_init(void)
 
 arch_initcall(hugetlbpage_init);
 
-void flush_dcache_icache_hugepage(struct page *page)
-{
-       int i;
-       void *start;
-
-       BUG_ON(!PageCompound(page));
-
-       for (i = 0; i < compound_nr(page); i++) {
-               if (!PageHighMem(page)) {
-                       __flush_dcache_icache(page_address(page+i));
-               } else {
-                       start = kmap_atomic(page+i);
-                       __flush_dcache_icache(start);
-                       kunmap_atomic(start);
-               }
-       }
-}
-
 void __init gigantic_hugetlb_cma_reserve(void)
 {
        unsigned long order = 0;
index afab328..4e8ce6d 100644
@@ -91,27 +91,6 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
        return -ENODEV;
 }
 
-#define FLUSH_CHUNK_SIZE SZ_1G
-/**
- * flush_dcache_range_chunked(): Write any modified data cache blocks out to
- * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE
- * Does not invalidate the corresponding instruction cache blocks.
- *
- * @start: the start address
- * @stop: the stop address (exclusive)
- * @chunk: the max size of the chunks
- */
-static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
-                                      unsigned long chunk)
-{
-       unsigned long i;
-
-       for (i = start; i < stop; i += chunk) {
-               flush_dcache_range(i, min(stop, i + chunk));
-               cond_resched();
-       }
-}
-
 int __ref arch_create_linear_mapping(int nid, u64 start, u64 size,
                                     struct mhp_params *params)
 {
@@ -136,7 +115,6 @@ void __ref arch_remove_linear_mapping(u64 start, u64 size)
 
        /* Remove htab bolted mappings for this section of memory */
        start = (unsigned long)__va(start);
-       flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
 
        mutex_lock(&linear_mapping_mutex);
        ret = remove_section_mapping(start, start + size);
@@ -489,19 +467,35 @@ void flush_dcache_page(struct page *page)
        if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                return;
        /* avoid an atomic op if possible */
-       if (test_bit(PG_arch_1, &page->flags))
-               clear_bit(PG_arch_1, &page->flags);
+       if (test_bit(PG_dcache_clean, &page->flags))
+               clear_bit(PG_dcache_clean, &page->flags);
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
-void flush_dcache_icache_page(struct page *page)
+static void flush_dcache_icache_hugepage(struct page *page)
 {
-#ifdef CONFIG_HUGETLB_PAGE
-       if (PageCompound(page)) {
-               flush_dcache_icache_hugepage(page);
-               return;
+       int i;
+       void *start;
+
+       BUG_ON(!PageCompound(page));
+
+       for (i = 0; i < compound_nr(page); i++) {
+               if (!PageHighMem(page)) {
+                       __flush_dcache_icache(page_address(page+i));
+               } else {
+                       start = kmap_atomic(page+i);
+                       __flush_dcache_icache(start);
+                       kunmap_atomic(start);
+               }
        }
-#endif
+}
+
+void flush_dcache_icache_page(struct page *page)
+{
+       if (PageCompound(page))
+               return flush_dcache_icache_hugepage(page);
+
 #if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64)
        /* On 8xx there is no need to kmap since highmem is not supported */
        __flush_dcache_icache(page_address(page));
index 15555c9..3546119 100644
@@ -26,6 +26,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
 
 static inline int is_exec_fault(void)
 {
@@ -81,9 +82,9 @@ static pte_t set_pte_filter_hash(pte_t pte)
                struct page *pg = maybe_pte_to_page(pte);
                if (!pg)
                        return pte;
-               if (!test_bit(PG_arch_1, &pg->flags)) {
+               if (!test_bit(PG_dcache_clean, &pg->flags)) {
                        flush_dcache_icache_page(pg);
-                       set_bit(PG_arch_1, &pg->flags);
+                       set_bit(PG_dcache_clean, &pg->flags);
                }
        }
        return pte;
@@ -116,13 +117,13 @@ static inline pte_t set_pte_filter(pte_t pte)
                return pte;
 
        /* If the page clean, we move on */
-       if (test_bit(PG_arch_1, &pg->flags))
+       if (test_bit(PG_dcache_clean, &pg->flags))
                return pte;
 
        /* If it's an exec fault, we flush the cache and make it clean */
        if (is_exec_fault()) {
                flush_dcache_icache_page(pg);
-               set_bit(PG_arch_1, &pg->flags);
+               set_bit(PG_dcache_clean, &pg->flags);
                return pte;
        }
 
@@ -161,12 +162,12 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
                goto bail;
 
        /* If the page is already clean, we move on */
-       if (test_bit(PG_arch_1, &pg->flags))
+       if (test_bit(PG_dcache_clean, &pg->flags))
                goto bail;
 
-       /* Clean the page and set PG_arch_1 */
+       /* Clean the page and set PG_dcache_clean */
        flush_dcache_icache_page(pg);
-       set_bit(PG_arch_1, &pg->flags);
+       set_bit(PG_dcache_clean, &pg->flags);
 
  bail:
        return pte_mkexec(pte);
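The rename from PG_arch_1 to PG_dcache_clean is mechanical, but it names the idiom these hunks share: the flag is a one-shot memo so a page's caches are flushed at most once between dirtying events. A condensed sketch of the pattern (the helper name is invented for illustration):

    static void make_page_coherent(struct page *pg)
    {
            if (test_bit(PG_dcache_clean, &pg->flags))
                    return;                          /* already flushed once */
            flush_dcache_icache_page(pg);
            set_bit(PG_dcache_clean, &pg->flags);    /* remember it is clean */
    }

flush_dcache_page() clearing the bit in the earlier mem.c hunk is the other half of the protocol: a writer marks the page as needing another flush simply by clearing the flag.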
index dde2fe8..565048a 100644
@@ -10,7 +10,7 @@
 
 static void seg_show(struct seq_file *m, int i)
 {
-       u32 val = mfsrin(i << 28);
+       u32 val = mfsr(i << 28);
 
        seq_printf(m, "0x%01x0000000-0x%01xfffffff ", i, i);
        seq_printf(m, "Kern key %d ", (val >> 30) & 1);
index 869d999..6817331 100644
@@ -54,6 +54,9 @@ struct cpu_hw_events {
        struct  perf_branch_stack       bhrb_stack;
        struct  perf_branch_entry       bhrb_entries[BHRB_MAX_ENTRIES];
        u64                             ic_init;
+
+       /* Store the PMC values */
+       unsigned long pmcs[MAX_HWEVENTS];
 };
 
 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
@@ -110,10 +113,6 @@ static inline void perf_read_regs(struct pt_regs *regs)
 {
        regs->result = 0;
 }
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-       return 0;
-}
 
 static inline int siar_valid(struct pt_regs *regs)
 {
@@ -147,6 +146,17 @@ bool is_sier_available(void)
        return false;
 }
 
+/*
+ * Return the PMC value corresponding to the index passed.
+ */
+unsigned long get_pmcs_ext_regs(int idx)
+{
+       struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+
+       return cpuhw->pmcs[idx];
+}
+
 static bool regs_use_siar(struct pt_regs *regs)
 {
        /*
@@ -353,15 +363,6 @@ static inline void perf_read_regs(struct pt_regs *regs)
        regs->result = use_siar;
 }
 
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-       return (regs->softe & IRQS_DISABLED);
-}
-
 /*
  * On processors like P7+ that have the SIAR-Valid bit, marked instructions
  * must be sampled only if the SIAR-valid bit is set.
@@ -915,7 +916,7 @@ void perf_event_print_debug(void)
  */
 static int power_check_constraints(struct cpu_hw_events *cpuhw,
                                   u64 event_id[], unsigned int cflags[],
-                                  int n_ev)
+                                  int n_ev, struct perf_event **event)
 {
        unsigned long mask, value, nv;
        unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
@@ -938,7 +939,7 @@ static int power_check_constraints(struct cpu_hw_events *cpuhw,
                        event_id[i] = cpuhw->alternatives[i][0];
                }
                if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
-                                        &cpuhw->avalues[i][0]))
+                                        &cpuhw->avalues[i][0], event[i]->attr.config1))
                        return -1;
        }
        value = mask = 0;
@@ -973,7 +974,8 @@ static int power_check_constraints(struct cpu_hw_events *cpuhw,
                for (j = 1; j < n_alt[i]; ++j)
                        ppmu->get_constraint(cpuhw->alternatives[i][j],
                                             &cpuhw->amasks[i][j],
-                                            &cpuhw->avalues[i][j]);
+                                            &cpuhw->avalues[i][j],
+                                            event[i]->attr.config1);
        }
 
        /* enumerate all possibilities and see if any will work */
@@ -1391,7 +1393,7 @@ static void power_pmu_enable(struct pmu *pmu)
        memset(&cpuhw->mmcr, 0, sizeof(cpuhw->mmcr));
 
        if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
-                              &cpuhw->mmcr, cpuhw->event)) {
+                              &cpuhw->mmcr, cpuhw->event, ppmu->flags)) {
                /* shouldn't ever get here */
                printk(KERN_ERR "oops compute_mmcr failed\n");
                goto out;
@@ -1579,7 +1581,7 @@ static int power_pmu_add(struct perf_event *event, int ef_flags)
 
        if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
                goto out;
-       if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
+       if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1, cpuhw->event))
                goto out;
        event->hw.config = cpuhw->events[n0];
 
@@ -1789,7 +1791,7 @@ static int power_pmu_commit_txn(struct pmu *pmu)
        n = cpuhw->n_events;
        if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
                return -EAGAIN;
-       i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
+       i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n, cpuhw->event);
        if (i < 0)
                return -EAGAIN;
 
@@ -2027,7 +2029,7 @@ static int power_pmu_event_init(struct perf_event *event)
        local_irq_save(irq_flags);
        cpuhw = this_cpu_ptr(&cpu_hw_events);
 
-       err = power_check_constraints(cpuhw, events, cflags, n + 1);
+       err = power_check_constraints(cpuhw, events, cflags, n + 1, ctrs);
 
        if (has_branch_stack(event)) {
                u64 bhrb_filter = -1;
@@ -2149,7 +2151,17 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                        left += period;
                        if (left <= 0)
                                left = period;
-                       record = siar_valid(regs);
+
+                       /*
+                        * If the address is not requested in the sample
+                        * via PERF_SAMPLE_IP, just record the sample
+                        * irrespective of the SIAR valid check.
+                        */
+                       if (event->attr.sample_type & PERF_SAMPLE_IP)
+                               record = siar_valid(regs);
+                       else
+                               record = 1;
+
                        event->hw.last_period = event->hw.sample_period;
                }
                if (left < 0x80000000LL)
@@ -2167,9 +2179,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         * MMCR2. Check attr.exclude_kernel and address to drop the sample in
         * these cases.
         */
-       if (event->attr.exclude_kernel && record)
-               if (is_kernel_addr(mfspr(SPRN_SIAR)))
-                       record = 0;
+       if (event->attr.exclude_kernel &&
+           (event->attr.sample_type & PERF_SAMPLE_IP) &&
+           is_kernel_addr(mfspr(SPRN_SIAR)))
+               record = 0;
 
        /*
         * Finally record data if requested.
@@ -2277,9 +2290,7 @@ static void __perf_event_interrupt(struct pt_regs *regs)
        int i, j;
        struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
        struct perf_event *event;
-       unsigned long val[8];
        int found, active;
-       int nmi;
 
        if (cpuhw->n_limited)
                freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
@@ -2287,26 +2298,14 @@ static void __perf_event_interrupt(struct pt_regs *regs)
 
        perf_read_regs(regs);
 
-       /*
-        * If perf interrupts hit in a local_irq_disable (soft-masked) region,
-        * we consider them as NMIs. This is required to prevent hash faults on
-        * user addresses when reading callchains. See the NMI test in
-        * do_hash_page.
-        */
-       nmi = perf_intr_is_nmi(regs);
-       if (nmi)
-               nmi_enter();
-       else
-               irq_enter();
-
        /* Read all the PMCs since we'll need them a bunch of times */
        for (i = 0; i < ppmu->n_counter; ++i)
-               val[i] = read_pmc(i + 1);
+               cpuhw->pmcs[i] = read_pmc(i + 1);
 
        /* Try to find what caused the IRQ */
        found = 0;
        for (i = 0; i < ppmu->n_counter; ++i) {
-               if (!pmc_overflow(val[i]))
+               if (!pmc_overflow(cpuhw->pmcs[i]))
                        continue;
                if (is_limited_pmc(i + 1))
                        continue; /* these won't generate IRQs */
@@ -2321,7 +2320,7 @@ static void __perf_event_interrupt(struct pt_regs *regs)
                        event = cpuhw->event[j];
                        if (event->hw.idx == (i + 1)) {
                                active = 1;
-                               record_and_restart(event, val[i], regs);
+                               record_and_restart(event, cpuhw->pmcs[i], regs);
                                break;
                        }
                }
@@ -2335,17 +2334,17 @@ static void __perf_event_interrupt(struct pt_regs *regs)
                        event = cpuhw->event[i];
                        if (!event->hw.idx || is_limited_pmc(event->hw.idx))
                                continue;
-                       if (pmc_overflow_power7(val[event->hw.idx - 1])) {
+                       if (pmc_overflow_power7(cpuhw->pmcs[event->hw.idx - 1])) {
                                /* event has overflowed in a buggy way*/
                                found = 1;
                                record_and_restart(event,
-                                                  val[event->hw.idx - 1],
+                                                  cpuhw->pmcs[event->hw.idx - 1],
                                                   regs);
                        }
                }
        }
-       if (!found && !nmi && printk_ratelimit())
-               printk(KERN_WARNING "Can't find PMC that caused IRQ\n");
+       if (unlikely(!found) && !arch_irq_disabled_regs(regs))
+               printk_ratelimited(KERN_WARNING "Can't find PMC that caused IRQ\n");
 
        /*
         * Reset MMCR0 to its normal value.  This will set PMXE and
@@ -2356,10 +2355,9 @@ static void __perf_event_interrupt(struct pt_regs *regs)
         */
        write_mmcr0(cpuhw, cpuhw->mmcr.mmcr0);
 
-       if (nmi)
-               nmi_exit();
-       else
-               irq_exit();
+       /* Clear the cpuhw->pmcs */
+       memset(&cpuhw->pmcs, 0, sizeof(cpuhw->pmcs));
 }
 
 static void perf_event_interrupt(struct pt_regs *regs)
index e0e7e27..ee721f4 100644
@@ -31,19 +31,6 @@ static atomic_t num_events;
 /* Used to avoid races in calling reserve/release_pmc_hardware */
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-#ifdef __powerpc64__
-       return (regs->softe & IRQS_DISABLED);
-#else
-       return 0;
-#endif
-}
-
 static void perf_event_interrupt(struct pt_regs *regs);
 
 /*
@@ -659,13 +646,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
        struct perf_event *event;
        unsigned long val;
        int found = 0;
-       int nmi;
-
-       nmi = perf_intr_is_nmi(regs);
-       if (nmi)
-               nmi_enter();
-       else
-               irq_enter();
 
        for (i = 0; i < ppmu->n_counter; ++i) {
                event = cpuhw->event[i];
@@ -690,11 +670,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
        mtmsr(mfmsr() | MSR_PMM);
        mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE);
        isync();
-
-       if (nmi)
-               nmi_exit();
-       else
-               irq_exit();
 }
 
 void hw_perf_event_setup(int cpu)
index 6e7e820..e5eb332 100644
@@ -764,6 +764,14 @@ static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
        return ev_len;
 }
 
+/*
+ * Return true in case of invalid or dummy events with names like RESERVED*
+ */
+static bool ignore_event(const char *name)
+{
+       return strncmp(name, "RESERVED", 8) == 0;
+}
+
 #define MAX_4K (SIZE_MAX / 4096)
 
 static int create_events_from_catalog(struct attribute ***events_,
@@ -894,6 +902,10 @@ static int create_events_from_catalog(struct attribute ***events_,
 
                name = event_name(event, &nl);
 
+               if (ignore_event(name)) {
+                       junk_events++;
+                       continue;
+               }
                if (event->event_group_record_len == 0) {
                        pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
                                        event_idx, nl, name);
@@ -955,6 +967,9 @@ static int create_events_from_catalog(struct attribute ***events_,
                        continue;
 
                name  = event_name(event, &nl);
+               if (ignore_event(name))
+                       continue;
+
                nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
                ct    = event_data_to_attrs(event_idx, events + event_attr_ct,
                                            event, nonce);
index 6ab5b27..e4f577d 100644
@@ -108,12 +108,57 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra)
                *mmcra |= MMCRA_SDAR_MODE_TLB;
 }
 
+static u64 p10_thresh_cmp_val(u64 value)
+{
+       int exp = 0;
+       u64 result = value;
+
+       if (!value)
+               return value;
+
+       /*
+        * In the case of P10, the thresh_cmp value is not part of the raw
+        * event code and is provided via the attr.config1 parameter. To
+        * program the threshold in MMCRA, take an 18-bit number N and shift
+        * it right 2 places, incrementing the exponent E by 1, until the
+        * upper 10 bits of N are zero.
+        * Write E to the threshold exponent and write the lower 8 bits of N
+        * to the threshold mantissa.
+        * The max threshold that can be written is 261120.
+        */
+       if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+               if (value > 261120)
+                       value = 261120;
+               while ((64 - __builtin_clzl(value)) > 8) {
+                       exp++;
+                       value >>= 2;
+               }
+
+               /*
+                * Note that it is invalid to write a mantissa with the
+                * upper 2 bits of mantissa being zero, unless the
+                * exponent is also zero.
+                */
+               if (!(value & 0xC0) && exp)
+                       result = 0;
+               else
+                       result = (exp << 8) | value;
+       }
+       return result;
+}
+
 static u64 thresh_cmp_val(u64 value)
 {
+       if (cpu_has_feature(CPU_FTR_ARCH_31))
+               value = p10_thresh_cmp_val(value);
+
+       /*
+        * The threshold compare bits sit at a different location in
+        * MMCRA on p8, so use a different shift value there.
+        */
        if (cpu_has_feature(CPU_FTR_ARCH_300))
                return value << p9_MMCRA_THR_CMP_SHIFT;
-
-       return value << MMCRA_THR_CMP_SHIFT;
+       else
+               return value << MMCRA_THR_CMP_SHIFT;
 }
 
 static unsigned long combine_from_event(u64 event)
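The encoding rule above is easy to check in isolation. A stand-alone sketch under stated assumptions (the kernel's CPU-feature checks are dropped, __builtin_clzll is substituted for the kernel's __builtin_clzl, and the function name is ours):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t encode_thresh_cmp(uint64_t value)
    {
            int exp = 0;

            if (!value)
                    return 0;
            if (value > 261120)     /* max: mantissa 0xFF, exp 5 -> 255 << 10 */
                    value = 261120;
            while ((64 - __builtin_clzll(value)) > 8) {  /* > 8 significant bits */
                    exp++;
                    value >>= 2;
            }
            if (!(value & 0xC0) && exp)  /* upper 2 mantissa bits clear: invalid */
                    return 0;
            return ((uint64_t)exp << 8) | value;
    }

    int main(void)
    {
            /* 1000 needs 10 bits: one shift by 2 gives exp 1, mantissa 250,
             * and 250 << (2 * 1) decodes back to exactly 1000. */
            printf("0x%llx\n", (unsigned long long)encode_thresh_cmp(1000));
            return 0;
    }

This prints 0x1fa, i.e. exponent 1 in bits 8 and up, mantissa 0xfa in the low byte.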
@@ -141,13 +186,13 @@ static bool is_thresh_cmp_valid(u64 event)
 {
        unsigned int cmp, exp;
 
+       if (cpu_has_feature(CPU_FTR_ARCH_31))
+               return p10_thresh_cmp_val(event) != 0;
+
        /*
         * Check the mantissa upper two bits are not zero, unless the
         * exponent is also zero. See the THRESH_CMP_MANTISSA doc.
-        * Power10: thresh_cmp is replaced by l2_l3 event select.
         */
-       if (cpu_has_feature(CPU_FTR_ARCH_31))
-               return false;
 
        cmp = (event >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK;
        exp = cmp >> 7;
@@ -256,7 +301,7 @@ void isa207_get_mem_weight(u64 *weight)
                *weight = mantissa << (2 * exp);
 }
 
-int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
+int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp, u64 event_config1)
 {
        unsigned int unit, pmc, cache, ebb;
        unsigned long mask, value;
@@ -355,9 +400,11 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
        }
 
        if (cpu_has_feature(CPU_FTR_ARCH_31)) {
-               if (event_is_threshold(event)) {
+               if (event_is_threshold(event) && is_thresh_cmp_valid(event_config1)) {
                        mask  |= CNST_THRESH_CTL_SEL_MASK;
                        value |= CNST_THRESH_CTL_SEL_VAL(event >> EVENT_THRESH_SHIFT);
+                       mask  |= p10_CNST_THRESH_CMP_MASK;
+                       value |= p10_CNST_THRESH_CMP_VAL(p10_thresh_cmp_val(event_config1));
                }
        } else if (cpu_has_feature(CPU_FTR_ARCH_300))  {
                if (event_is_threshold(event) && is_thresh_cmp_valid(event)) {
@@ -411,7 +458,7 @@ ebb_bhrb:
 
 int isa207_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], struct mmcr_regs *mmcr,
-                              struct perf_event *pevents[])
+                              struct perf_event *pevents[], u32 flags)
 {
        unsigned long mmcra, mmcr1, mmcr2, unit, combine, psel, cache, val;
        unsigned long mmcr3;
@@ -504,6 +551,10 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
                                val = (event[i] >> EVENT_THR_CMP_SHIFT) &
                                        EVENT_THR_CMP_MASK;
                                mmcra |= thresh_cmp_val(val);
+                       } else if (flags & PPMU_HAS_ATTR_CONFIG1) {
+                               val = (pevents[i]->attr.config1 >> p10_EVENT_THR_CMP_SHIFT) &
+                                       p10_EVENT_THR_CMP_MASK;
+                               mmcra |= thresh_cmp_val(val);
                        }
                }
 
index 454b32c..1af0e8c 100644
 #define p10_EVENT_RADIX_SCOPE_QUAL_MASK        0x1
 #define p10_MMCR1_RADIX_SCOPE_QUAL_SHIFT       45
 
+/* Event Threshold Compare bit constant for power10 in config1 attribute */
+#define p10_EVENT_THR_CMP_SHIFT        0
+#define p10_EVENT_THR_CMP_MASK 0x3FFFFull
+
 #define p10_EVENT_VALID_MASK           \
        ((p10_SDAR_MODE_MASK   << p10_SDAR_MODE_SHIFT           |       \
        (p10_EVENT_THRESH_MASK  << EVENT_THRESH_SHIFT)          |       \
  *        60        56        52        48        44        40        36        32
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
  *   [   fab_match   ]         [       thresh_cmp      ] [   thresh_ctl    ] [   ]
- *                                                                             |
- *                                                                 thresh_sel -*
+ *                                          |                                  |
+ *                           [  thresh_cmp bits for p10]           thresh_sel -*
  *
  *        28        24        20        16        12         8         4         0
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
 #define CNST_THRESH_CTL_SEL_VAL(v)     (((v) & 0x7ffull) << 32)
 #define CNST_THRESH_CTL_SEL_MASK       CNST_THRESH_CTL_SEL_VAL(0x7ff)
 
+#define p10_CNST_THRESH_CMP_VAL(v) (((v) & 0x7ffull) << 43)
+#define p10_CNST_THRESH_CMP_MASK   p10_CNST_THRESH_CMP_VAL(0x7ff)
+
 #define CNST_EBB_VAL(v)                (((v) & EVENT_EBB_MASK) << 24)
 #define CNST_EBB_MASK          CNST_EBB_VAL(EVENT_EBB_MASK)
 
 #define PH(a, b)                       (P(LVL, HIT) | P(a, b))
 #define PM(a, b)                       (P(LVL, MISS) | P(a, b))
 
-int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
+int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp, u64 event_config1);
 int isa207_compute_mmcr(u64 event[], int n_ev,
                                unsigned int hwc[], struct mmcr_regs *mmcr,
-                               struct perf_event *pevents[]);
+                               struct perf_event *pevents[], u32 flags);
 void isa207_disable_pmc(unsigned int pmc, struct mmcr_regs *mmcr);
 int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags,
                                        const unsigned int ev_alt[][MAX_ALT]);
index 1919e9d..e39b15b 100644
@@ -148,7 +148,7 @@ static u32 classbits[N_CLASSES - 1][2] = {
 };
 
 static int mpc7450_get_constraint(u64 event, unsigned long *maskp,
-                                 unsigned long *valp)
+                                 unsigned long *valp, u64 event_config1 __maybe_unused)
 {
        int pmc, class;
        u32 mask, value;
@@ -258,7 +258,8 @@ static const u32 pmcsel_mask[N_COUNTER] = {
  */
 static int mpc7450_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[],
                                struct mmcr_regs *mmcr,
-                               struct perf_event *pevents[])
+                               struct perf_event *pevents[],
+                               u32 flags __maybe_unused)
 {
        u8 event_index[N_CLASSES][N_COUNTER];
        int n_classevent[N_CLASSES];
index 6f681b1..b931eed 100644
@@ -75,6 +75,8 @@ static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
 static u64 get_ext_regs_value(int idx)
 {
        switch (idx) {
+       case PERF_REG_POWERPC_PMC1 ... PERF_REG_POWERPC_PMC6:
+               return get_pmcs_ext_regs(idx - PERF_REG_POWERPC_PMC1);
        case PERF_REG_POWERPC_MMCR0:
                return mfspr(SPRN_MMCR0);
        case PERF_REG_POWERPC_MMCR1:
@@ -95,13 +97,6 @@ static u64 get_ext_regs_value(int idx)
 
 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
-       u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
-
-       if (cpu_has_feature(CPU_FTR_ARCH_31))
-               perf_reg_extended_max = PERF_REG_MAX_ISA_31;
-       else if (cpu_has_feature(CPU_FTR_ARCH_300))
-               perf_reg_extended_max = PERF_REG_MAX_ISA_300;
-
        if (idx == PERF_REG_POWERPC_SIER &&
           (IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
            IS_ENABLED(CONFIG_PPC32) ||
@@ -113,14 +108,14 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
            IS_ENABLED(CONFIG_PPC32)))
                return 0;
 
-       if (idx >= PERF_REG_POWERPC_MAX && idx < perf_reg_extended_max)
+       if (idx >= PERF_REG_POWERPC_MAX && idx < PERF_REG_EXTENDED_MAX)
                return get_ext_regs_value(idx);
 
        /*
         * If the idx is referring to value beyond the
         * supported registers, return 0 with a warning
         */
-       if (WARN_ON_ONCE(idx >= perf_reg_extended_max))
+       if (WARN_ON_ONCE(idx >= PERF_REG_EXTENDED_MAX))
                return 0;
 
        return regs_get_register(regs, pt_regs_offset[idx]);
index 79e0206..a901c13 100644
@@ -216,6 +216,7 @@ PMU_FORMAT_ATTR(invert_bit,     "config:47");
 PMU_FORMAT_ATTR(src_mask,       "config:48-53");
 PMU_FORMAT_ATTR(src_match,      "config:54-59");
 PMU_FORMAT_ATTR(radix_scope,   "config:9");
+PMU_FORMAT_ATTR(thresh_cmp,     "config1:0-17");
 
 static struct attribute *power10_pmu_format_attr[] = {
        &format_attr_event.attr,
@@ -236,6 +237,7 @@ static struct attribute *power10_pmu_format_attr[] = {
        &format_attr_src_mask.attr,
        &format_attr_src_match.attr,
        &format_attr_radix_scope.attr,
+       &format_attr_thresh_cmp.attr,
        NULL,
 };
 
@@ -550,7 +552,7 @@ static struct power_pmu power10_pmu = {
        .get_mem_weight         = isa207_get_mem_weight,
        .disable_pmc            = isa207_disable_pmc,
        .flags                  = PPMU_HAS_SIER | PPMU_ARCH_207S |
-                                 PPMU_ARCH_31,
+                                 PPMU_ARCH_31 | PPMU_HAS_ATTR_CONFIG1,
        .n_generic              = ARRAY_SIZE(power10_generic_events),
        .generic_events         = power10_generic_events,
        .cache_events           = &power10_cache_events,
index 3e64b4a..1873226 100644
@@ -132,7 +132,7 @@ static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
 };
 
 static int power5p_get_constraint(u64 event, unsigned long *maskp,
-                                 unsigned long *valp)
+                                 unsigned long *valp, u64 event_config1 __maybe_unused)
 {
        int pmc, byte, unit, sh;
        int bit, fmask;
@@ -451,7 +451,8 @@ static int power5p_marked_instr_event(u64 event)
 
 static int power5p_compute_mmcr(u64 event[], int n_ev,
                                unsigned int hwc[], struct mmcr_regs *mmcr,
-                               struct perf_event *pevents[])
+                               struct perf_event *pevents[],
+                               u32 flags __maybe_unused)
 {
        unsigned long mmcr1 = 0;
        unsigned long mmcra = 0;
index 017bb19..cb611c1 100644
@@ -136,7 +136,7 @@ static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
 };
 
 static int power5_get_constraint(u64 event, unsigned long *maskp,
-                                unsigned long *valp)
+                                unsigned long *valp, u64 event_config1 __maybe_unused)
 {
        int pmc, byte, unit, sh;
        int bit, fmask;
@@ -382,7 +382,8 @@ static int power5_marked_instr_event(u64 event)
 
 static int power5_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], struct mmcr_regs *mmcr,
-                              struct perf_event *pevents[])
+                              struct perf_event *pevents[],
+                              u32 flags __maybe_unused)
 {
        unsigned long mmcr1 = 0;
        unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
index 1899744..69ef382 100644
@@ -173,7 +173,8 @@ static int power6_marked_instr_event(u64 event)
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
 static int p6_compute_mmcr(u64 event[], int n_ev,
-                          unsigned int hwc[], struct mmcr_regs *mmcr, struct perf_event *pevents[])
+                          unsigned int hwc[], struct mmcr_regs *mmcr, struct perf_event *pevents[],
+                          u32 flags __maybe_unused)
 {
        unsigned long mmcr1 = 0;
        unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
@@ -266,7 +267,7 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
  *     32-34   select field: nest (subunit) event selector
  */
 static int p6_get_constraint(u64 event, unsigned long *maskp,
-                            unsigned long *valp)
+                            unsigned long *valp, u64 event_config1 __maybe_unused)
 {
        int pmc, byte, sh, subunit;
        unsigned long mask = 0, value = 0;
index bacfab1..894c17f 100644
@@ -81,7 +81,7 @@ enum {
  */
 
 static int power7_get_constraint(u64 event, unsigned long *maskp,
-                                unsigned long *valp)
+                                unsigned long *valp, u64 event_config1 __maybe_unused)
 {
        int pmc, sh, unit;
        unsigned long mask = 0, value = 0;
@@ -245,7 +245,8 @@ static int power7_marked_instr_event(u64 event)
 
 static int power7_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], struct mmcr_regs *mmcr,
-                              struct perf_event *pevents[])
+                              struct perf_event *pevents[],
+                              u32 flags __maybe_unused)
 {
        unsigned long mmcr1 = 0;
        unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
index 7d78df9..1f82637 100644
@@ -190,7 +190,7 @@ static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
 };
 
 static int p970_get_constraint(u64 event, unsigned long *maskp,
-                              unsigned long *valp)
+                              unsigned long *valp, u64 event_config1 __maybe_unused)
 {
        int pmc, byte, unit, sh, spcsel;
        unsigned long mask = 0, value = 0;
@@ -256,7 +256,8 @@ static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 
 static int p970_compute_mmcr(u64 event[], int n_ev,
                             unsigned int hwc[], struct mmcr_regs *mmcr,
-                            struct perf_event *pevents[])
+                            struct perf_event *pevents[],
+                            u32 flags __maybe_unused)
 {
        unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0;
        unsigned int pmc, unit, byte, psel;
index 78ac6d6..7d41e92 100644
@@ -206,17 +206,10 @@ config AKEBONO
        select PPC4xx_HSTA_MSI
        select I2C
        select I2C_IBM_IIC
-       select NETDEVICES
-       select ETHERNET
-       select NET_VENDOR_IBM
        select IBM_EMAC_EMAC4 if IBM_EMAC
        select USB if USB_SUPPORT
        select USB_OHCI_HCD_PLATFORM if USB_OHCI_HCD
        select USB_EHCI_HCD_PLATFORM if USB_EHCI_HCD
-       select MMC_SDHCI
-       select MMC_SDHCI_PLTFM
-       select ATA
-       select SATA_AHCI_PLATFORM
        help
          This option enables support for the IBM Akebono (476gtr) evaluation board
 
index 6303fbf..9d030c2 100644
 
 static void __init mpc5121_ads_setup_arch(void)
 {
-#ifdef CONFIG_PCI
-       struct device_node *np;
-#endif
        printk(KERN_INFO "MPC5121 ADS board from Freescale Semiconductor\n");
        /*
         * cpld regs are needed early
         */
        mpc5121_ads_cpld_map();
 
+       mpc512x_setup_arch();
+}
+
+static void __init mpc5121_ads_setup_pci(void)
+{
 #ifdef CONFIG_PCI
+       struct device_node *np;
+
        for_each_compatible_node(np, "pci", "fsl,mpc5121-pci")
                mpc83xx_add_bridge(np);
 #endif
-
-       mpc512x_setup_arch();
 }
 
 static void __init mpc5121_ads_init_IRQ(void)
@@ -64,6 +66,7 @@ define_machine(mpc5121_ads) {
        .name                   = "MPC5121 ADS",
        .probe                  = mpc5121_ads_probe,
        .setup_arch             = mpc5121_ads_setup_arch,
+       .discover_phbs          = mpc5121_ads_setup_pci,
        .init                   = mpc512x_init,
        .init_IRQ               = mpc5121_ads_init_IRQ,
        .get_irq                = ipic_get_irq,
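
The same refactor repeats across the board files that follow: PHB discovery moves out of .setup_arch into the new .discover_phbs machdep hook, so host bridges are probed later in boot when the generic PCI infrastructure is ready. A minimal sketch of the pattern, with the myboard_* names and the compatible string as hypothetical placeholders:

#include <linux/of.h>
#include <asm/machdep.h>

static int __init myboard_add_bridge(struct device_node *np)
{
	/* hypothetical: create and register a pci_controller for @np */
	return 0;
}

static void __init myboard_setup_arch(void)
{
	/* early, PCI-free board setup only */
}

static void __init myboard_discover_phbs(void)
{
#ifdef CONFIG_PCI
	struct device_node *np;

	for_each_compatible_node(np, "pci", "myvendor,myboard-pci")
		myboard_add_bridge(np);
#endif
}

define_machine(myboard) {
	.name		= "My Board",
	.setup_arch	= myboard_setup_arch,
	.discover_phbs	= myboard_discover_phbs,
};
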
index 4514a6f..3b7d70d 100644 (file)
@@ -185,8 +185,6 @@ static void __init efika_setup_arch(void)
        /* Map important registers from the internal memory map */
        mpc52xx_map_common_devices();
 
-       efika_pcisetup();
-
 #ifdef CONFIG_PM
        mpc52xx_suspend.board_suspend_prepare = efika_suspend_prepare;
        mpc52xx_pm_init();
@@ -218,6 +216,7 @@ define_machine(efika)
        .name                   = EFIKA_PLATFORM_NAME,
        .probe                  = efika_probe,
        .setup_arch             = efika_setup_arch,
+       .discover_phbs          = efika_pcisetup,
        .init                   = mpc52xx_declare_of_platform_devices,
        .show_cpuinfo           = efika_show_cpuinfo,
        .init_IRQ               = mpc52xx_init_irq,
index 3181aac..04cc973 100644 (file)
@@ -165,8 +165,6 @@ static void __init lite5200_setup_arch(void)
        mpc52xx_suspend.board_resume_finish = lite5200_resume_finish;
        lite5200_pm_init();
 #endif
-
-       mpc52xx_setup_pci();
 }
 
 static const char * const board[] __initconst = {
@@ -187,6 +185,7 @@ define_machine(lite5200) {
        .name           = "lite5200",
        .probe          = lite5200_probe,
        .setup_arch     = lite5200_setup_arch,
+       .discover_phbs  = mpc52xx_setup_pci,
        .init           = mpc52xx_declare_of_platform_devices,
        .init_IRQ       = mpc52xx_init_irq,
        .get_irq        = mpc52xx_get_irq,
index 07c5bc4..efb8bde 100644 (file)
@@ -202,8 +202,6 @@ static void __init media5200_setup_arch(void)
        /* Some mpc5200 & mpc5200b related configuration */
        mpc5200_setup_xlb_arbiter();
 
-       mpc52xx_setup_pci();
-
        np = of_find_matching_node(NULL, mpc5200_gpio_ids);
        gpio = of_iomap(np, 0);
        of_node_put(np);
@@ -244,6 +242,7 @@ define_machine(media5200_platform) {
        .name           = "media5200-platform",
        .probe          = media5200_probe,
        .setup_arch     = media5200_setup_arch,
+       .discover_phbs  = mpc52xx_setup_pci,
        .init           = mpc52xx_declare_of_platform_devices,
        .init_IRQ       = media5200_init_irq,
        .get_irq        = mpc52xx_get_irq,
index 2d01e9b..b9f5675 100644 (file)
@@ -40,8 +40,6 @@ static void __init mpc5200_simple_setup_arch(void)
 
        /* Some mpc5200 & mpc5200b related configuration */
        mpc5200_setup_xlb_arbiter();
-
-       mpc52xx_setup_pci();
 }
 
 /* list of the supported boards */
@@ -73,6 +71,7 @@ define_machine(mpc5200_simple_platform) {
        .name           = "mpc5200-simple-platform",
        .probe          = mpc5200_simple_probe,
        .setup_arch     = mpc5200_simple_setup_arch,
+       .discover_phbs  = mpc52xx_setup_pci,
        .init           = mpc52xx_declare_of_platform_devices,
        .init_IRQ       = mpc52xx_init_irq,
        .get_irq        = mpc52xx_get_irq,
index 05e1947..b91ebeb 100644 (file)
@@ -229,7 +229,7 @@ static irqreturn_t mpc52xx_lpbfifo_irq(int irq, void *dev_id)
        int dma, write, poll_dma;
 
        spin_lock_irqsave(&lpbfifo.lock, flags);
-       ts = get_tbl();
+       ts = mftb();
 
        req = lpbfifo.req;
        if (!req) {
@@ -307,7 +307,7 @@ static irqreturn_t mpc52xx_lpbfifo_irq(int irq, void *dev_id)
        if (irq != 0) /* don't increment on polled case */
                req->irq_count++;
 
-       req->irq_ticks += get_tbl() - ts;
+       req->irq_ticks += mftb() - ts;
        spin_unlock_irqrestore(&lpbfifo.lock, flags);
 
        /* Spinlock is released; it is now safe to call the callback */
@@ -330,7 +330,7 @@ static irqreturn_t mpc52xx_lpbfifo_bcom_irq(int irq, void *dev_id)
        u32 ts;
 
        spin_lock_irqsave(&lpbfifo.lock, flags);
-       ts = get_tbl();
+       ts = mftb();
 
        req = lpbfifo.req;
        if (!req || (req->flags & MPC52XX_LPBFIFO_FLAG_NO_DMA)) {
@@ -361,7 +361,7 @@ static irqreturn_t mpc52xx_lpbfifo_bcom_irq(int irq, void *dev_id)
        lpbfifo.req = NULL;
 
        /* Release the lock before calling out to the callback. */
-       req->irq_ticks += get_tbl() - ts;
+       req->irq_ticks += mftb() - ts;
        spin_unlock_irqrestore(&lpbfifo.lock, flags);
 
        if (req->callback)
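
get_tbl() is being retired in favour of mftb(), which reads the same timebase on both 32-bit and 64-bit parts. A hedged sketch of the tick-delta idiom used in the handlers above (the driver truncates to 32 bits, which is fine for short intervals):

#include <linux/types.h>
#include <asm/time.h>

static u32 timed_section(void (*body)(void))
{
	u32 ts = mftb();	/* timebase snapshot, replaces get_tbl() */

	body();
	return mftb() - ts;	/* elapsed ticks; wrap-safe for short spans */
}
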
index 3fe1a65..0b5b9de 100644 (file)
@@ -171,7 +171,6 @@ static void __init mpc8272_ads_setup_arch(void)
        iounmap(bcsr);
 
        init_ioports();
-       pq2_init_pci();
 
        if (ppc_md.progress)
                ppc_md.progress("mpc8272_ads_setup_arch(), finish", 0);
@@ -205,6 +204,7 @@ define_machine(mpc8272_ads)
        .name = "Freescale MPC8272 ADS",
        .probe = mpc8272_ads_probe,
        .setup_arch = mpc8272_ads_setup_arch,
+       .discover_phbs = pq2_init_pci,
        .init_IRQ = mpc8272_ads_pic_init,
        .get_irq = cpm2_get_irq,
        .calibrate_decr = generic_calibrate_decr,
index 096cc0d..f82f75a 100644 (file)
@@ -123,20 +123,17 @@ int __init pq2ads_pci_init_irq(void)
        np = of_find_compatible_node(NULL, NULL, "fsl,pq2ads-pci-pic");
        if (!np) {
                printk(KERN_ERR "No pci pic node in device tree.\n");
-               of_node_put(np);
                goto out;
        }
 
        irq = irq_of_parse_and_map(np, 0);
        if (!irq) {
                printk(KERN_ERR "No interrupt in pci pic node.\n");
-               of_node_put(np);
-               goto out;
+               goto out_put_node;
        }
 
        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv) {
-               of_node_put(np);
                ret = -ENOMEM;
                goto out_unmap_irq;
        }
@@ -161,17 +158,17 @@ int __init pq2ads_pci_init_irq(void)
        priv->host = host;
        irq_set_handler_data(irq, priv);
        irq_set_chained_handler(irq, pq2ads_pci_irq_demux);
-
-       of_node_put(np);
-       return 0;
+       ret = 0;
+       goto out_put_node;
 
 out_unmap_regs:
        iounmap(priv->regs);
 out_free_kmalloc:
        kfree(priv);
-       of_node_put(np);
 out_unmap_irq:
        irq_dispose_mapping(irq);
+out_put_node:
+       of_node_put(np);
 out:
        return ret;
 }
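
This hunk replaces four scattered of_node_put() calls with a single exit ladder, so the reference is dropped in exactly one place on both success and failure. A minimal sketch of the idiom, assuming a hypothetical "vendor,example-pic" node and a subsystem hand-off on success:

#include <linux/of.h>
#include <linux/slab.h>

static int __init example_pic_init(void)
{
	struct device_node *np;
	void *priv;
	int ret = -ENODEV;

	np = of_find_compatible_node(NULL, NULL, "vendor,example-pic");
	if (!np)
		goto out;

	priv = kzalloc(64, GFP_KERNEL);
	if (!priv) {
		ret = -ENOMEM;
		goto out_put_node;
	}

	/* ... hand priv off to the irq core here ... */
	ret = 0;

out_put_node:
	of_node_put(np);	/* the only put, on every path */
out:
	return ret;
}
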
index a740821..ac9113d 100644 (file)
@@ -150,8 +150,6 @@ static void __init pq2fads_setup_arch(void)
        /* Enable external IRQs */
        clrbits32(&cpm2_immr->im_siu_conf.siu_82xx.sc_siumcr, 0x0c000000);
 
-       pq2_init_pci();
-
        if (ppc_md.progress)
                ppc_md.progress("pq2fads_setup_arch(), finish", 0);
 }
@@ -184,6 +182,7 @@ define_machine(pq2fads)
        .name = "Freescale PQ2FADS",
        .probe = pq2fads_probe,
        .setup_arch = pq2fads_setup_arch,
+       .discover_phbs = pq2_init_pci,
        .init_IRQ = pq2fads_pic_init,
        .get_irq = cpm2_get_irq,
        .calibrate_decr = generic_calibrate_decr,
index 2847487..68061c2 100644 (file)
@@ -44,6 +44,7 @@ define_machine(asp834x) {
        .name                   = "ASP8347E",
        .probe                  = asp834x_probe,
        .setup_arch             = asp834x_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index bcdc2c2..108e1e4 100644 (file)
@@ -180,6 +180,7 @@ define_machine(mpc83xx_km) {
        .name           = "mpc83xx-km-platform",
        .probe          = mpc83xx_km_probe,
        .setup_arch     = mpc83xx_km_setup_arch,
+       .discover_phbs  = mpc83xx_setup_pci,
        .init_IRQ       = mpc83xx_ipic_init_IRQ,
        .get_irq        = ipic_get_irq,
        .restart        = mpc83xx_restart,
index a952e91..3285dab 100644 (file)
@@ -132,8 +132,6 @@ void __init mpc83xx_setup_arch(void)
                setbat(-1, va, immrbase, immrsize, PAGE_KERNEL_NCG);
                update_bats();
        }
-
-       mpc83xx_setup_pci();
 }
 
 int machine_check_83xx(struct pt_regs *regs)
index 51426e8..956d438 100644 (file)
@@ -48,6 +48,7 @@ define_machine(mpc830x_rdb) {
        .name                   = "MPC830x RDB",
        .probe                  = mpc830x_rdb_probe,
        .setup_arch             = mpc830x_rdb_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index 5ccd57a..3b578f0 100644 (file)
@@ -48,6 +48,7 @@ define_machine(mpc831x_rdb) {
        .name                   = "MPC831x RDB",
        .probe                  = mpc831x_rdb_probe,
        .setup_arch             = mpc831x_rdb_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index 6fa5402..850d566 100644 (file)
@@ -101,6 +101,7 @@ define_machine(mpc832x_mds) {
        .name           = "MPC832x MDS",
        .probe          = mpc832x_sys_probe,
        .setup_arch     = mpc832x_sys_setup_arch,
+       .discover_phbs  = mpc83xx_setup_pci,
        .init_IRQ       = mpc83xx_ipic_init_IRQ,
        .get_irq        = ipic_get_irq,
        .restart        = mpc83xx_restart,
index 622c625..b6133a2 100644 (file)
@@ -219,6 +219,7 @@ define_machine(mpc832x_rdb) {
        .name           = "MPC832x RDB",
        .probe          = mpc832x_rdb_probe,
        .setup_arch     = mpc832x_rdb_setup_arch,
+       .discover_phbs  = mpc83xx_setup_pci,
        .init_IRQ       = mpc83xx_ipic_init_IRQ,
        .get_irq        = ipic_get_irq,
        .restart        = mpc83xx_restart,
index ebfd139..9630f3a 100644 (file)
@@ -70,6 +70,7 @@ define_machine(mpc834x_itx) {
        .name                   = "MPC834x ITX",
        .probe                  = mpc834x_itx_probe,
        .setup_arch             = mpc834x_itx_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index 356228e..6d91bdc 100644 (file)
@@ -91,6 +91,7 @@ define_machine(mpc834x_mds) {
        .name                   = "MPC834x MDS",
        .probe                  = mpc834x_mds_probe,
        .setup_arch             = mpc834x_mds_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index 90d9cbf..da4cf52 100644 (file)
@@ -201,6 +201,7 @@ define_machine(mpc836x_mds) {
        .name           = "MPC836x MDS",
        .probe          = mpc836x_mds_probe,
        .setup_arch     = mpc836x_mds_setup_arch,
+       .discover_phbs  = mpc83xx_setup_pci,
        .init_IRQ       = mpc83xx_ipic_init_IRQ,
        .get_irq        = ipic_get_irq,
        .restart        = mpc83xx_restart,
index b4aac2c..3427ad0 100644 (file)
@@ -41,6 +41,7 @@ define_machine(mpc836x_rdk) {
        .name           = "MPC836x RDK",
        .probe          = mpc836x_rdk_probe,
        .setup_arch     = mpc836x_rdk_setup_arch,
+       .discover_phbs  = mpc83xx_setup_pci,
        .init_IRQ       = mpc83xx_ipic_init_IRQ,
        .get_irq        = ipic_get_irq,
        .restart        = mpc83xx_restart,
index 9d3721c..f28d166 100644 (file)
@@ -93,6 +93,7 @@ define_machine(mpc837x_mds) {
        .name                   = "MPC837x MDS",
        .probe                  = mpc837x_mds_probe,
        .setup_arch             = mpc837x_mds_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index 7c45f7a..7fb7684 100644 (file)
@@ -73,6 +73,7 @@ define_machine(mpc837x_rdb) {
        .name                   = "MPC837x RDB/WLAN",
        .probe                  = mpc837x_rdb_probe,
        .setup_arch             = mpc837x_rdb_setup_arch,
+       .discover_phbs          = mpc83xx_setup_pci,
        .init_IRQ               = mpc83xx_ipic_init_IRQ,
        .get_irq                = ipic_get_irq,
        .restart                = mpc83xx_restart,
index f37d043..a30d305 100644 (file)
@@ -76,7 +76,7 @@ extern void mpc83xx_ipic_init_IRQ(void);
 #ifdef CONFIG_PCI
 extern void mpc83xx_setup_pci(void);
 #else
-#define mpc83xx_setup_pci()    do {} while (0)
+#define mpc83xx_setup_pci      NULL
 #endif
 
 extern int mpc83xx_declare_of_platform_devices(void);
index 88dedf3..6563659 100644 (file)
@@ -26,7 +26,7 @@ int machine_check_8xx(struct pt_regs *regs)
         * to deal with that than having a wart in the mcheck handler.
         * -- BenH
         */
-       bad_page_fault(regs, regs->dar, SIGBUS);
+       bad_page_fault(regs, SIGBUS);
        return 1;
 #else
        return 0;
index f5d0bf9..9d252c5 100644 (file)
@@ -65,6 +65,12 @@ static int __init amigaone_add_bridge(struct device_node *dev)
 }
 
 void __init amigaone_setup_arch(void)
+{
+       if (ppc_md.progress)
+               ppc_md.progress("Linux/PPC "UTS_RELEASE"\n", 0);
+}
+
+static void __init amigaone_discover_phbs(void)
 {
        struct device_node *np;
        int phb = -ENODEV;
@@ -74,9 +80,6 @@ void __init amigaone_setup_arch(void)
                phb = amigaone_add_bridge(np);
 
        BUG_ON(phb != 0);
-
-       if (ppc_md.progress)
-               ppc_md.progress("Linux/PPC "UTS_RELEASE"\n", 0);
 }
 
 void __init amigaone_init_IRQ(void)
@@ -159,6 +162,7 @@ define_machine(amigaone) {
        .name                   = "AmigaOne",
        .probe                  = amigaone_probe,
        .setup_arch             = amigaone_setup_arch,
+       .discover_phbs          = amigaone_discover_phbs,
        .show_cpuinfo           = amigaone_show_cpuinfo,
        .init_IRQ               = amigaone_init_IRQ,
        .restart                = amigaone_restart,
index 9068ede..5b9a7e9 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/cpu_has_feature.h>
 
 #include "pervasive.h"
+#include "ras.h"
 
 static void cbe_power_save(void)
 {
index c6fccad..0da74ab 100644 (file)
@@ -13,9 +13,6 @@
 #define PERVASIVE_H
 
 extern void cbe_pervasive_init(void);
-extern void cbe_system_error_exception(struct pt_regs *regs);
-extern void cbe_maintenance_exception(struct pt_regs *regs);
-extern void cbe_thermal_exception(struct pt_regs *regs);
 
 #ifdef CONFIG_PPC_IBM_CELL_RESETBUTTON
 extern int cbe_sysreset_hack(void);
index 6ea4805..4325c05 100644 (file)
@@ -49,7 +49,7 @@ static void dump_fir(int cpu)
 
 }
 
-void cbe_system_error_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(cbe_system_error_exception)
 {
        int cpu = smp_processor_id();
 
@@ -58,7 +58,7 @@ void cbe_system_error_exception(struct pt_regs *regs)
        dump_stack();
 }
 
-void cbe_maintenance_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(cbe_maintenance_exception)
 {
        int cpu = smp_processor_id();
 
@@ -70,7 +70,7 @@ void cbe_maintenance_exception(struct pt_regs *regs)
        dump_stack();
 }
 
-void cbe_thermal_exception(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(cbe_thermal_exception)
 {
        int cpu = smp_processor_id();
 
index 6c2e6bc..226dbd4 100644 (file)
@@ -2,9 +2,12 @@
 #ifndef RAS_H
 #define RAS_H
 
-extern void cbe_system_error_exception(struct pt_regs *regs);
-extern void cbe_maintenance_exception(struct pt_regs *regs);
-extern void cbe_thermal_exception(struct pt_regs *regs);
+#include <asm/interrupt.h>
+
+DECLARE_INTERRUPT_HANDLER(cbe_system_error_exception);
+DECLARE_INTERRUPT_HANDLER(cbe_maintenance_exception);
+DECLARE_INTERRUPT_HANDLER(cbe_thermal_exception);
+
 extern void cbe_ras_init(void);
 
 #endif /* RAS_H */
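
The Cell RAS handlers are converted to the new interrupt-wrapper macros from asm/interrupt.h, which generate the irq/nmi entry and exit bookkeeping so it stays out of the handler body. A hedged sketch with a hypothetical handler name:

#include <linux/printk.h>
#include <linux/smp.h>
#include <asm/interrupt.h>

DECLARE_INTERRUPT_HANDLER(example_exception);

DEFINE_INTERRUPT_HANDLER(example_exception)
{
	/* the macro supplies the struct pt_regs *regs parameter */
	pr_err("example exception on CPU %d\n", smp_processor_id());
	dump_stack();
}
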
index b2c2bf3..8c421dc 100644 (file)
@@ -314,6 +314,14 @@ chrp_find_bridges(void)
                }
        }
        of_node_put(root);
+
+       /*
+        *  "Temporary" fixes for PCI devices.
+        *  -- Geert
+        */
+       hydra_init();           /* Mac I/O */
+
+       pci_create_OF_bus_map();
 }
 
 /* SL82C105 IDE Control/Status Register */
index c45435a..3cfc382 100644 (file)
@@ -334,22 +334,11 @@ static void __init chrp_setup_arch(void)
        /* On pegasos, enable the L2 cache if not already done by OF */
        pegasos_set_l2cr();
 
-       /* Lookup PCI host bridges */
-       chrp_find_bridges();
-
-       /*
-        *  Temporary fixes for PCI devices.
-        *  -- Geert
-        */
-       hydra_init();           /* Mac I/O */
-
        /*
         *  Fix the Super I/O configuration
         */
        sio_init();
 
-       pci_create_OF_bus_map();
-
        /*
         * Print the banner, then scroll down so boot progress
         * can be printed.  -- Cort
@@ -582,6 +571,7 @@ define_machine(chrp) {
        .name                   = "CHRP",
        .probe                  = chrp_probe,
        .setup_arch             = chrp_setup_arch,
+       .discover_phbs          = chrp_find_bridges,
        .init                   = chrp_init2,
        .show_cpuinfo           = chrp_show_cpuinfo,
        .init_IRQ               = chrp_init_IRQ,
index d8f2e2c..53065d5 100644 (file)
@@ -108,15 +108,13 @@ static void holly_remap_bridge(void)
        tsi108_write_reg(TSI108_PCI_P2O_BAR2, 0x0);
 }
 
-static void __init holly_setup_arch(void)
+static void __init holly_init_pci(void)
 {
        struct device_node *np;
 
        if (ppc_md.progress)
                ppc_md.progress("holly_setup_arch():set_bridge", 0);
 
-       tsi108_csr_vir_base = get_vir_csrbase();
-
        /* setup PCI host bridge */
        holly_remap_bridge();
 
@@ -127,6 +125,11 @@ static void __init holly_setup_arch(void)
        ppc_md.pci_exclude_device = holly_exclude_device;
        if (ppc_md.progress)
                ppc_md.progress("tsi108: resources set", 0x100);
+}
+
+static void __init holly_setup_arch(void)
+{
+       tsi108_csr_vir_base = get_vir_csrbase();
 
        printk(KERN_INFO "PPC750GX/CL Platform\n");
 }
@@ -259,6 +262,7 @@ define_machine(holly){
        .name                           = "PPC750 GX/CL TSI",
        .probe                          = holly_probe,
        .setup_arch                     = holly_setup_arch,
+       .discover_phbs                  = holly_init_pci,
        .init_IRQ                       = holly_init_IRQ,
        .show_cpuinfo                   = holly_show_cpuinfo,
        .get_irq                        = mpic_get_irq,
index f514d5d..eb8342e 100644 (file)
@@ -63,15 +63,18 @@ static int __init linkstation_add_bridge(struct device_node *dev)
 }
 
 static void __init linkstation_setup_arch(void)
+{
+       printk(KERN_INFO "BUFFALO Network Attached Storage Series\n");
+       printk(KERN_INFO "(C) 2002-2005 BUFFALO INC.\n");
+}
+
+static void __init linkstation_setup_pci(void)
 {
        struct device_node *np;
 
        /* Lookup PCI host bridges */
        for_each_compatible_node(np, "pci", "mpc10x-pci")
                linkstation_add_bridge(np);
-
-       printk(KERN_INFO "BUFFALO Network Attached Storage Series\n");
-       printk(KERN_INFO "(C) 2002-2005 BUFFALO INC.\n");
 }
 
 /*
@@ -153,6 +156,7 @@ define_machine(linkstation){
        .name                   = "Buffalo Linkstation",
        .probe                  = linkstation_probe,
        .setup_arch             = linkstation_setup_arch,
+       .discover_phbs          = linkstation_setup_pci,
        .init_IRQ               = linkstation_init_IRQ,
        .show_cpuinfo           = linkstation_show_cpuinfo,
        .get_irq                = mpic_get_irq,
index b95c338..5565647 100644 (file)
@@ -58,16 +58,14 @@ int mpc7448_hpc2_exclude_device(struct pci_controller *hose,
                return PCIBIOS_SUCCESSFUL;
 }
 
-static void __init mpc7448_hpc2_setup_arch(void)
+static void __init mpc7448_hpc2_setup_pci(void)
 {
+#ifdef CONFIG_PCI
        struct device_node *np;
        if (ppc_md.progress)
-               ppc_md.progress("mpc7448_hpc2_setup_arch():set_bridge", 0);
-
-       tsi108_csr_vir_base = get_vir_csrbase();
+               ppc_md.progress("mpc7448_hpc2_setup_pci():set_bridge", 0);
 
        /* setup PCI host bridge */
-#ifdef CONFIG_PCI
        for_each_compatible_node(np, "pci", "tsi108-pci")
                tsi108_setup_pci(np, MPC7448HPC2_PCI_CFG_PHYS, 0);
 
@@ -75,6 +73,11 @@ static void __init mpc7448_hpc2_setup_arch(void)
        if (ppc_md.progress)
                ppc_md.progress("tsi108: resources set", 0x100);
 #endif
+}
+
+static void __init mpc7448_hpc2_setup_arch(void)
+{
+       tsi108_csr_vir_base = get_vir_csrbase();
 
        printk(KERN_INFO "MPC7448HPC2 (TAIGA) Platform\n");
        printk(KERN_INFO
@@ -181,6 +184,7 @@ define_machine(mpc7448_hpc2){
        .name                   = "MPC7448 HPC2",
        .probe                  = mpc7448_hpc2_probe,
        .setup_arch             = mpc7448_hpc2_setup_arch,
+       .discover_phbs          = mpc7448_hpc2_setup_pci,
        .init_IRQ               = mpc7448_hpc2_init_IRQ,
        .show_cpuinfo           = mpc7448_hpc2_show_cpuinfo,
        .get_irq                = mpic_get_irq,
index 1cd488d..c06a049 100644 (file)
@@ -154,17 +154,19 @@ static const struct of_device_id mvme5100_of_bus_ids[] __initconst = {
  */
 static void __init mvme5100_setup_arch(void)
 {
-       struct device_node *np;
-
        if (ppc_md.progress)
                ppc_md.progress("mvme5100_setup_arch()", 0);
 
-       for_each_compatible_node(np, "pci", "hawk-pci")
-               mvme5100_add_bridge(np);
-
        restart = ioremap(BOARD_MODRST_REG, 4);
 }
 
+static void __init mvme5100_setup_pci(void)
+{
+       struct device_node *np;
+
+       for_each_compatible_node(np, "pci", "hawk-pci")
+               mvme5100_add_bridge(np);
+}
 
 static void mvme5100_show_cpuinfo(struct seq_file *m)
 {
@@ -205,6 +207,7 @@ define_machine(mvme5100) {
        .name                   = "MVME5100",
        .probe                  = mvme5100_probe,
        .setup_arch             = mvme5100_setup_arch,
+       .discover_phbs          = mvme5100_setup_pci,
        .init_IRQ               = mvme5100_pic_init,
        .show_cpuinfo           = mvme5100_show_cpuinfo,
        .get_irq                = mpic_get_irq,
index e346ddc..e188b90 100644 (file)
@@ -65,14 +65,17 @@ static int __init storcenter_add_bridge(struct device_node *dev)
 }
 
 static void __init storcenter_setup_arch(void)
+{
+       printk(KERN_INFO "IOMEGA StorCenter\n");
+}
+
+static void __init storcenter_setup_pci(void)
 {
        struct device_node *np;
 
        /* Lookup PCI host bridges */
        for_each_compatible_node(np, "pci", "mpc10x-pci")
                storcenter_add_bridge(np);
-
-       printk(KERN_INFO "IOMEGA StorCenter\n");
 }
 
 /*
@@ -117,6 +120,7 @@ define_machine(storcenter){
        .name                   = "IOMEGA StorCenter",
        .probe                  = storcenter_probe,
        .setup_arch             = storcenter_setup_arch,
+       .discover_phbs          = storcenter_setup_pci,
        .init_IRQ               = storcenter_init_IRQ,
        .get_irq                = mpic_get_irq,
        .restart                = storcenter_restart,
index c86a66d..a20b957 100644 (file)
@@ -536,6 +536,9 @@ static int __init maple_add_bridge(struct device_node *dev)
        /* Check for legacy IOs */
        isa_bridge_find_early(hose);
 
+       /* create pci_dn's for DT nodes under this PHB */
+       pci_devs_phb_init_dynamic(hose);
+
        return 0;
 }
 
index f7e66a2..4e9ad5b 100644 (file)
@@ -179,9 +179,6 @@ static void __init maple_setup_arch(void)
 #ifdef CONFIG_SMP
        smp_ops = &maple_smp_ops;
 #endif
-       /* Lookup PCI hosts */
-               maple_pci_init();
-
        maple_use_rtas_reboot_and_halt_if_present();
 
        printk(KERN_DEBUG "Using native/NAP idle loop\n");
@@ -351,6 +348,7 @@ define_machine(maple) {
        .name                   = "Maple",
        .probe                  = maple_probe,
        .setup_arch             = maple_setup_arch,
+       .discover_phbs          = maple_pci_init,
        .init_IRQ               = maple_init_IRQ,
        .pci_irq_fixup          = maple_pci_irq_fixup,
        .pci_get_legacy_ide_irq = maple_pci_get_legacy_ide_irq,
index b612474..376797e 100644 (file)
@@ -144,8 +144,6 @@ static void __init pas_setup_arch(void)
        /* Setup SMP callback */
        smp_ops = &pas_smp_ops;
 #endif
-       /* Lookup PCI hosts */
-       pas_pci_init();
 
        /* Remap SDC register for doing reset */
        /* XXXOJN This should maybe come out of the device tree */
@@ -446,6 +444,7 @@ define_machine(pasemi) {
        .name                   = "PA Semi PWRficient",
        .probe                  = pas_probe,
        .setup_arch             = pas_setup_arch,
+       .discover_phbs          = pas_pci_init,
        .init_IRQ               = pas_init_IRQ,
        .get_irq                = mpic_get_irq,
        .restart                = pas_restart,
index e35eaa9..e9abe0f 100644 (file)
@@ -850,6 +850,10 @@ static int __init pmac_add_bridge(struct device_node *dev)
        /* Fixup "bus-range" OF property */
        fixup_bus_range(dev);
 
+       /* create pci_dn's for DT nodes under this PHB */
+       if (IS_ENABLED(CONFIG_PPC64))
+               pci_devs_phb_init_dynamic(hose);
+
        return 0;
 }
 
index 2e2cc0c..86aee3f 100644 (file)
@@ -298,9 +298,6 @@ static void __init pmac_setup_arch(void)
                of_node_put(ic);
        }
 
-       /* Lookup PCI hosts */
-       pmac_pci_init();
-
 #ifdef CONFIG_PPC32
        ohare_init();
        l2cr_init();
@@ -600,6 +597,7 @@ define_machine(powermac) {
        .name                   = "PowerMac",
        .probe                  = pmac_probe,
        .setup_arch             = pmac_setup_arch,
+       .discover_phbs          = pmac_pci_init,
        .show_cpuinfo           = pmac_show_cpuinfo,
        .init_IRQ               = pmac_pic_init,
        .get_irq                = NULL, /* changed later */
index e6f4618..999997d 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <asm/asm-prototypes.h>
 #include <asm/firmware.h>
+#include <asm/interrupt.h>
 #include <asm/machdep.h>
 #include <asm/opal.h>
 #include <asm/cputhreads.h>
index 5fc9408..019669e 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/numa.h>
 #include <asm/machdep.h>
 #include <asm/debugfs.h>
+#include <asm/cacheflush.h>
 
 /* This enables us to keep track of the memory removed from each node. */
 struct memtrace_entry {
@@ -51,6 +52,27 @@ static const struct file_operations memtrace_fops = {
        .open   = simple_open,
 };
 
+#define FLUSH_CHUNK_SIZE SZ_1G
+/**
+ * flush_dcache_range_chunked(): Write any modified data cache blocks out to
+ * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE.
+ * Does not invalidate the corresponding instruction cache blocks.
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ * @chunk: the max size of the chunks
+ */
+static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
+                                      unsigned long chunk)
+{
+       unsigned long i;
+
+       for (i = start; i < stop; i += chunk) {
+               flush_dcache_range(i, min(stop, i + chunk));
+               cond_resched();
+       }
+}
+
 static void memtrace_clear_range(unsigned long start_pfn,
                                 unsigned long nr_pages)
 {
@@ -62,6 +84,13 @@ static void memtrace_clear_range(unsigned long start_pfn,
                        cond_resched();
                clear_page(__va(PFN_PHYS(pfn)));
        }
+       /*
+        * Before we go ahead and use this range as cache inhibited range
+        * flush the cache.
+        */
+       flush_dcache_range_chunked(PFN_PHYS(start_pfn),
+                                  PFN_PHYS(start_pfn + nr_pages),
+                                  FLUSH_CHUNK_SIZE);
 }
 
 static u64 memtrace_alloc_node(u32 nid, u64 size)
index c61c3b6..303d7c7 100644 (file)
@@ -624,7 +624,7 @@ static int opal_recover_mce(struct pt_regs *regs,
                         */
                        recovered = 0;
                } else {
-                       die("Machine check", regs, SIGBUS);
+                       die_mce("Machine check", regs, SIGBUS);
                        recovered = 1;
                }
        }
index 5218f5d..30551bb 100644 (file)
@@ -380,6 +380,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
 
        /* Remove link to a group from table's list of attached groups */
        found = false;
+
+       rcu_read_lock();
        list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
                if (tgl->table_group == table_group) {
                        list_del_rcu(&tgl->next);
@@ -388,6 +390,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
                        break;
                }
        }
+       rcu_read_unlock();
+
        if (WARN_ON(!found))
                return;
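
The fix above reflects a general RCU rule: list_for_each_entry_rcu() must run inside an RCU read-side critical section (or under the updater-side lock). A self-contained sketch of the read side:

#include <linux/types.h>
#include <linux/rculist.h>

struct item {
	int id;
	struct list_head next;
};

static bool list_contains(struct list_head *head, int id)
{
	struct item *it;
	bool found = false;

	rcu_read_lock();	/* protects the traversal, not the items */
	list_for_each_entry_rcu(it, head, next) {
		if (it->id == id) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
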
 
index c4f72cd..f0f9016 100644 (file)
@@ -2402,9 +2402,6 @@ static void pnv_pci_ioda_create_dbgfs(void)
        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;
 
-               /* Notify initialization of PHB done */
-               phb->initialized = 1;
-
                sprintf(name, "PCI%04x", hose->global_number);
                phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
 
@@ -2601,17 +2598,8 @@ static resource_size_t pnv_pci_default_alignment(void)
  */
 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
 {
-       struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
        struct pci_dn *pdn;
 
-       /* The function is probably called while the PEs have
-        * not be created yet. For example, resource reassignment
-        * during PCI probe period. We just skip the check if
-        * PEs isn't ready.
-        */
-       if (!phb->initialized)
-               return true;
-
        pdn = pci_get_pdn(dev);
        if (!pdn || pdn->pe_number == IODA_INVALID_PE) {
                pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n");
@@ -2623,14 +2611,9 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
 
 static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
 {
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn;
        struct pnv_ioda_pe *pe;
 
-       if (!phb->initialized)
-               return true;
-
        pdn = pci_get_pdn(dev);
        if (!pdn)
                return false;
@@ -2938,7 +2921,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        phb_id = be64_to_cpup(prop64);
        pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
 
-       phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
+       phb = kzalloc(sizeof(*phb), GFP_KERNEL);
        if (!phb)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      sizeof(*phb));
@@ -2987,7 +2970,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        else
                phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
 
-       phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
+       phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL);
        if (!phb->diag_data)
                panic("%s: Failed to allocate %u bytes\n", __func__,
                      phb->diag_data_size);
@@ -3049,9 +3032,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        }
        pemap_off = size;
        size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
-       aux = memblock_alloc(size, SMP_CACHE_BYTES);
+       aux = kzalloc(size, GFP_KERNEL);
        if (!aux)
                panic("%s: Failed to allocate %lu bytes\n", __func__, size);
+
        phb->ioda.pe_alloc = aux;
        phb->ioda.m64_segmap = aux + m64map_off;
        phb->ioda.m32_segmap = aux + m32map_off;
@@ -3178,6 +3162,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        /* Remove M64 resource if we can't configure it successfully */
        if (!phb->init_m64 || phb->init_m64(phb))
                hose->mem_resources[1].flags = 0;
+
+       /* create pci_dn's for DT nodes under this PHB */
+       pci_devs_phb_init_dynamic(hose);
 }
 
 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
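
These hunks swap memblock_alloc() for kzalloc() because PHB setup now runs late enough in boot that the slab allocator is available; memblock is only for the window before mm_init(). A sketch of the replacement pattern, with phb_data as a stand-in type:

#include <linux/kernel.h>
#include <linux/slab.h>

struct phb_data {
	unsigned long flags;
};

static struct phb_data *alloc_phb_data(void)
{
	/* zeroed like memblock_alloc(), but may sleep and may fail */
	struct phb_data *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		panic("%s: Failed to allocate %zu bytes\n",
		      __func__, sizeof(*p));
	return p;
}
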
index 739a0b3..36d2292 100644 (file)
@@ -119,7 +119,6 @@ struct pnv_phb {
        int                     flags;
        void __iomem            *regs;
        u64                     regs_phys;
-       int                     initialized;
        spinlock_t              lock;
 
 #ifdef CONFIG_DEBUG_FS
index 4426a10..aadf932 100644 (file)
@@ -180,9 +180,6 @@ static void __init pnv_setup_arch(void)
        /* Initialize SMP */
        pnv_smp_init();
 
-       /* Setup PCI */
-       pnv_pci_init();
-
        /* Setup RTC and NVRAM callbacks */
        if (firmware_has_feature(FW_FEATURE_OPAL))
                opal_nvram_init();
@@ -547,6 +544,7 @@ define_machine(powernv) {
        .init_IRQ               = pnv_init_IRQ,
        .show_cpuinfo           = pnv_show_cpuinfo,
        .get_proc_freq          = pnv_get_proc_freq,
+       .discover_phbs          = pnv_pci_init,
        .progress               = pnv_progress,
        .machine_shutdown       = pnv_shutdown,
        .power_save             = NULL,
index c8f574d..77feee8 100644 (file)
@@ -15,7 +15,7 @@
 void split_core_secondary_loop(u8 *state);
 extern void update_subcore_sibling_mask(void);
 #else
-static inline void update_subcore_sibling_mask(void) { };
+static inline void update_subcore_sibling_mask(void) { }
 #endif /* CONFIG_SMP */
 
 #endif /* __ASSEMBLY__ */
index 598e4cd..b65256a 100644 (file)
@@ -28,12 +28,10 @@ static DEFINE_PER_CPU(int, cpu_vas_id);
 
 static int vas_irq_fault_window_setup(struct vas_instance *vinst)
 {
-       char devname[64];
        int rc = 0;
 
-       snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id);
        rc = request_threaded_irq(vinst->virq, vas_fault_handler,
-                               vas_fault_thread_fn, 0, devname, vinst);
+                               vas_fault_thread_fn, 0, vinst->name, vinst);
 
        if (rc) {
                pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
@@ -80,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev)
        if (!vinst)
                return -ENOMEM;
 
+       vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid);
+       if (!vinst->name) {
+               kfree(vinst);
+               return -ENOMEM;
+       }
+
        INIT_LIST_HEAD(&vinst->node);
        ida_init(&vinst->ida);
        mutex_init(&vinst->mutex);
@@ -162,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev)
        return 0;
 
 free_vinst:
+       kfree(vinst->name);
        kfree(vinst);
        return -ENODEV;
 
index 70f793e..c7db319 100644 (file)
@@ -340,6 +340,7 @@ struct vas_instance {
        struct vas_window *rxwin[VAS_COP_TYPE_MAX];
        struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
 
+       char *name;
        char *dbgname;
        struct dentry *dbgdir;
 };
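
The VAS change gives each instance a heap-allocated name via kasprintf(): request_threaded_irq() keeps the name pointer for the lifetime of the IRQ, so it must not point into a stack buffer. A minimal sketch of the ownership pattern, with struct inst as a stand-in:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct inst {
	int id;
	char *name;
};

static int inst_init(struct inst *vi)
{
	vi->name = kasprintf(GFP_KERNEL, "vas-%d", vi->id);
	if (!vi->name)
		return -ENOMEM;

	/* vi->name is now safe to hand to request_threaded_irq() */
	return 0;
}

static void inst_fini(struct inst *vi)
{
	kfree(vi->name);	/* pairs with the kasprintf() above */
}
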
index 16e86ba..233503f 100644 (file)
@@ -127,7 +127,6 @@ void dlpar_free_cc_nodes(struct device_node *dn)
 #define NEXT_PROPERTY   3
 #define PREV_PARENT     4
 #define MORE_MEMORY     5
-#define CALL_AGAIN     -2
 #define ERR_CFG_USE     -9003
 
 struct device_node *dlpar_configure_connector(__be32 drc_index,
@@ -168,6 +167,9 @@ struct device_node *dlpar_configure_connector(__be32 drc_index,
 
                spin_unlock(&rtas_data_buf_lock);
 
+               if (rtas_busy_delay(rc))
+                       continue;
+
                switch (rc) {
                case COMPLETE:
                        break;
@@ -216,9 +218,6 @@ struct device_node *dlpar_configure_connector(__be32 drc_index,
                        last_dn = last_dn->parent;
                        break;
 
-               case CALL_AGAIN:
-                       break;
-
                case MORE_MEMORY:
                case ERR_CFG_USE:
                default:
@@ -521,11 +520,8 @@ static ssize_t dlpar_store(struct class *class, struct class_attribute *attr,
        int rc;
 
        args = argbuf = kstrdup(buf, GFP_KERNEL);
-       if (!argbuf) {
-               pr_info("Could not allocate resources for DLPAR operation\n");
-               kfree(argbuf);
+       if (!argbuf)
                return -ENOMEM;
-       }
 
        /*
         * Parse out the request from the user, this will be in the form:
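
The dlpar change drops the private CALL_AGAIN handling in favour of rtas_busy_delay(), which recognises both the busy and extended-delay statuses, sleeps as needed, and reports whether the call should be reissued. A hedged sketch of the resulting loop shape:

#include <asm/rtas.h>

static int rtas_call_retrying(int token)
{
	int rc;

	do {
		rc = rtas_call(token, 0, 1, NULL);
	} while (rtas_busy_delay(rc));	/* sleeps, then asks for a retry */

	return rc;
}
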
index cf024fa..bc15200 100644 (file)
@@ -43,7 +43,7 @@ static int ibm_get_config_addr_info;
 static int ibm_get_config_addr_info2;
 static int ibm_configure_pe;
 
-void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
+static void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 {
        struct pci_dn *pdn = pci_get_pdn(pdev);
 
@@ -694,8 +694,7 @@ static int pseries_eeh_write_config(struct eeh_dev *edev, int where, int size, u
 }
 
 #ifdef CONFIG_PCI_IOV
-int pseries_send_allow_unfreeze(struct pci_dn *pdn,
-                               u16 *vf_pe_array, int cur_vfs)
+static int pseries_send_allow_unfreeze(struct pci_dn *pdn, u16 *vf_pe_array, int cur_vfs)
 {
        int rc;
        int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze");
index 72a4d41..1bffbd1 100644 (file)
@@ -55,9 +55,8 @@ struct pe_map_bar_entry {
        __be32     reserved;  /* Reserved Space */
 };
 
-int pseries_send_map_pe(struct pci_dev *pdev,
-                       u16 num_vfs,
-                       struct pe_map_bar_entry *vf_pe_array)
+static int pseries_send_map_pe(struct pci_dev *pdev, u16 num_vfs,
+                              struct pe_map_bar_entry *vf_pe_array)
 {
        struct pci_dn *pdn;
        int rc;
@@ -88,7 +87,7 @@ int pseries_send_map_pe(struct pci_dev *pdev,
        return rc;
 }
 
-void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num)
+static void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num)
 {
        struct pci_dn *pdn;
 
@@ -102,7 +101,7 @@ void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num)
                pdn->pe_num_map[vf_index]);
 }
 
-int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs)
+static int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs)
 {
        struct pci_dn *pdn;
        int i, rc, vf_index;
@@ -146,7 +145,7 @@ int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs)
        return rc;
 }
 
-int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+static int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
        struct pci_dn         *pdn;
        int                    rc;
@@ -189,14 +188,14 @@ int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
        return rc;
 }
 
-int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+static int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
        /* Allocate PCI data */
        add_sriov_vf_pdns(pdev);
        return pseries_pci_sriov_enable(pdev, num_vfs);
 }
 
-int pseries_pcibios_sriov_disable(struct pci_dev *pdev)
+static int pseries_pcibios_sriov_disable(struct pci_dev *pdev)
 {
        struct pci_dn         *pdn;
 
index 5938408..4fe48c0 100644 (file)
@@ -33,7 +33,7 @@ int smp_query_cpu_stopped(unsigned int pcpu);
 #define QCSS_HARDWARE_ERROR -1
 #define QCSS_HARDWARE_BUSY -2
 #else
-static inline void smp_init_pseries(void) { };
+static inline void smp_init_pseries(void) { }
 #endif
 
 extern void pseries_kexec_cpu_down(int crash_shutdown, int secondary);
index 149cec2..f8b390a 100644 (file)
@@ -122,7 +122,7 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
  * devices or systems (e.g. hugepages) that have not been initialized at the
  * subsys stage.
  */
-int __init init_ras_hotplug_IRQ(void)
+static int __init init_ras_hotplug_IRQ(void)
 {
        struct device_node *np;
 
@@ -315,12 +315,10 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id)
 /* Handle environmental and power warning (EPOW) interrupts. */
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 {
-       int status;
        int state;
        int critical;
 
-       status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX,
-                                     &state);
+       rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
 
        if (state > 3)
                critical = 1;           /* Time Critical */
@@ -329,12 +327,9 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 
        spin_lock(&ras_log_buf_lock);
 
-       status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-                          RTAS_VECTOR_EXTERNAL_INTERRUPT,
-                          virq_to_hw(irq),
-                          RTAS_EPOW_WARNING,
-                          critical, __pa(&ras_log_buf),
-                               rtas_get_error_log_max());
+       rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT,
+                 virq_to_hw(irq), RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf),
+                 rtas_get_error_log_max());
 
        log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
 
@@ -722,6 +717,7 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
        struct pseries_errorlog *pseries_log;
        struct pseries_mc_errorlog *mce_log = NULL;
        int disposition = rtas_error_disposition(errp);
+       unsigned long msr;
        u8 error_type;
 
        if (!rtas_error_extended(errp))
@@ -747,9 +743,21 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
         *       SLB multihit is done by now.
         */
 out:
-       mtmsr(mfmsr() | MSR_IR | MSR_DR);
+       msr = mfmsr();
+       mtmsr(msr | MSR_IR | MSR_DR);
+
        disposition = mce_handle_err_virtmode(regs, errp, mce_log,
                                              disposition);
+
+       /*
+        * Queue irq work to log this rtas event later.
+        * irq_work_queue uses per-cpu variables, so do this in virt
+        * mode as well.
+        */
+       irq_work_queue(&mce_errlog_process_work);
+
+       mtmsr(msr);
+
        return disposition;
 }
 
@@ -813,7 +821,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
                         */
                        recovered = 0;
                } else {
-                       die("Machine check", regs, SIGBUS);
+                       die_mce("Machine check", regs, SIGBUS);
                        recovered = 1;
                }
        }
@@ -865,10 +873,8 @@ long pseries_machine_check_realmode(struct pt_regs *regs)
                 * virtual mode.
                 */
                disposition = mce_handle_error(regs, errp);
-               fwnmi_release_errinfo();
 
-               /* Queue irq work to log this rtas event later. */
-               irq_work_queue(&mce_errlog_process_work);
+               fwnmi_release_errinfo();
 
                if (disposition == RTAS_DISP_FULLY_RECOVERED)
                        return 1;
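
The machine-check path now snapshots the MSR, enables instruction/data relocation for the virtual-mode work (including irq_work_queue(), which touches per-cpu data), and restores the saved value rather than assuming it entered in real mode. The shape of that dance, as a sketch:

#include <asm/reg.h>

static void with_virt_mode(void (*fn)(void))
{
	unsigned long msr = mfmsr();	/* snapshot; don't assume real mode */

	mtmsr(msr | MSR_IR | MSR_DR);	/* relocation on for per-cpu work */
	fn();
	mtmsr(msr);			/* restore exactly what we entered with */
}
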
index 090c13f..46e1540 100644 (file)
@@ -463,7 +463,7 @@ void pseries_little_endian_exceptions(void)
 }
 #endif
 
-static void __init find_and_init_phbs(void)
+static void __init pSeries_discover_phbs(void)
 {
        struct device_node *node;
        struct pci_controller *phb;
@@ -481,6 +481,9 @@ static void __init find_and_init_phbs(void)
                pci_process_bridge_OF_ranges(phb, node, 0);
                isa_bridge_find_early(phb);
                phb->controller_ops = pseries_pci_controller_ops;
+
+               /* create pci_dn's for DT nodes under this PHB */
+               pci_devs_phb_init_dynamic(phb);
        }
 
        of_node_put(root);
@@ -607,8 +610,8 @@ enum get_iov_fw_value_index {
        WDW_SIZE      = 3     /*  Get Window Size */
 };
 
-resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
-                                        enum get_iov_fw_value_index value)
+static resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
+                                               enum get_iov_fw_value_index value)
 {
        const int *indexes;
        struct device_node *dn = pci_device_to_OF_node(dev);
@@ -643,7 +646,7 @@ resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
        return ret;
 }
 
-void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
+static void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
 {
        struct resource *res;
        resource_size_t base, size;
@@ -665,7 +668,7 @@ void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
        }
 }
 
-void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
+static void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
 {
        struct resource *res, *root, *conflict;
        resource_size_t base, size;
@@ -786,7 +789,6 @@ static void __init pSeries_setup_arch(void)
 
        /* Find and initialize PCI host bridges */
        init_pci_config_tokens();
-       find_and_init_phbs();
        of_reconfig_notifier_register(&pci_dn_reconfig_nb);
 
        pSeries_nvram_init();
@@ -1050,6 +1052,7 @@ define_machine(pseries) {
        .init_IRQ               = pseries_init_irq,
        .show_cpuinfo           = pSeries_show_cpuinfo,
        .log_error              = pSeries_log_error,
+       .discover_phbs          = pSeries_discover_phbs,
        .pcibios_fixup          = pSeries_final_fixup,
        .restart                = rtas_restart,
        .halt                   = rtas_halt,
index dcd817c..3fe3749 100644 (file)
@@ -1383,7 +1383,6 @@ static long check_bp_loc(unsigned long addr)
        return 1;
 }
 
-#ifndef CONFIG_PPC_8xx
 static int find_free_data_bpt(void)
 {
        int i;
@@ -1395,7 +1394,6 @@ static int find_free_data_bpt(void)
        printf("Couldn't find free breakpoint register\n");
        return -1;
 }
-#endif
 
 static void print_data_bpts(void)
 {
@@ -1435,7 +1433,6 @@ bpt_cmds(void)
        cmd = inchar();
 
        switch (cmd) {
-#ifndef CONFIG_PPC_8xx
        static const char badaddr[] = "Only kernel addresses are permitted for breakpoints\n";
        int mode;
        case 'd':       /* bd - hardware data breakpoint */
@@ -1497,7 +1494,6 @@ bpt_cmds(void)
                        force_enable_xmon();
                }
                break;
-#endif
 
        case 'c':
                if (!scanhex(&a)) {
@@ -3723,7 +3719,7 @@ void dump_segments(void)
 
        printf("sr0-15 =");
        for (i = 0; i < 16; ++i)
-               printf(" %x", mfsrin(i << 28));
+               printf(" %x", mfsr(i << 28));
        printf("\n");
 }
 #endif
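
xmon's segment dump follows the mfsrin() to mfsr() rename. Judging by the i << 28 argument above, mfsr() keeps the address-style operand in which the top four bits select SR0-SR15; a hedged one-liner under that assumption:

#include <linux/types.h>
#include <asm/reg.h>

static u32 read_segment_reg(int i)
{
	/* top 4 bits of the operand pick the segment register */
	return mfsr((u32)i << 28);
}
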
index 2a1783f..53b9198 100644 (file)
@@ -170,8 +170,6 @@ int cxllib_get_PE_attributes(struct task_struct *task,
                             unsigned long translation_mode,
                             struct cxllib_pe_attributes *attr)
 {
-       struct mm_struct *mm = NULL;
-
        if (translation_mode != CXL_TRANSLATED_MODE &&
                translation_mode != CXL_REAL_MODE)
                return -EINVAL;
@@ -182,7 +180,7 @@ int cxllib_get_PE_attributes(struct task_struct *task,
                                true);
        attr->lpid = mfspr(SPRN_LPID);
        if (task) {
-               mm = get_task_mm(task);
+               struct mm_struct *mm = get_task_mm(task);
                if (mm == NULL)
                        return -EINVAL;
                /*
index 4d1b44d..e70525e 100644 (file)
@@ -15,7 +15,7 @@
 
 static dev_t ocxl_dev;
 static struct class *ocxl_class;
-static struct mutex minors_idr_lock;
+static DEFINE_MUTEX(minors_idr_lock);
 static struct idr minors_idr;
 
 static struct ocxl_file_info *find_and_get_file_info(dev_t devno)
@@ -588,7 +588,6 @@ int ocxl_file_init(void)
 {
        int rc;
 
-       mutex_init(&minors_idr_lock);
        idr_init(&minors_idr);
 
        rc = alloc_chrdev_region(&ocxl_dev, 0, OCXL_NUM_MINORS, "ocxl");
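
The ocxl change replaces a runtime mutex_init() with DEFINE_MUTEX, so the file-scope lock is valid from the first instruction and there is no initialization ordering to get wrong. A sketch:

#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);	/* initialized at compile time */

static void example_touch(void)
{
	mutex_lock(&example_lock);
	/* ... critical section; no prior mutex_init() required ... */
	mutex_unlock(&example_lock);
}
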
index 36f9415..124cba7 100644 (file)
@@ -120,7 +120,7 @@ static void mpc52xx_spi_start_transfer(struct mpc52xx_spi *ms)
        ms->cs_change = ms->transfer->cs_change;
 
        /* Write out the first byte */
-       ms->wcol_tx_timestamp = get_tbl();
+       ms->wcol_tx_timestamp = mftb();
        if (ms->tx_buf)
                out_8(ms->regs + SPI_DATA, *ms->tx_buf++);
        else
@@ -221,8 +221,8 @@ static int mpc52xx_spi_fsmstate_transfer(int irq, struct mpc52xx_spi *ms,
                 * but it can also be worked around simply by retrying the
                 * transfer which is what we do here. */
                ms->wcol_count++;
-               ms->wcol_ticks += get_tbl() - ms->wcol_tx_timestamp;
-               ms->wcol_tx_timestamp = get_tbl();
+               ms->wcol_ticks += mftb() - ms->wcol_tx_timestamp;
+               ms->wcol_tx_timestamp = mftb();
                data = 0;
                if (ms->tx_buf)
                        data = *(ms->tx_buf - 1);
@@ -247,7 +247,7 @@ static int mpc52xx_spi_fsmstate_transfer(int irq, struct mpc52xx_spi *ms,
        /* Is the transfer complete? */
        ms->len--;
        if (ms->len == 0) {
-               ms->timestamp = get_tbl();
+               ms->timestamp = mftb();
                if (ms->transfer->delay.unit == SPI_DELAY_UNIT_USECS)
                        ms->timestamp += ms->transfer->delay.value *
                                         tb_ticks_per_usec;
@@ -256,7 +256,7 @@ static int mpc52xx_spi_fsmstate_transfer(int irq, struct mpc52xx_spi *ms,
        }
 
        /* Write out the next byte */
-       ms->wcol_tx_timestamp = get_tbl();
+       ms->wcol_tx_timestamp = mftb();
        if (ms->tx_buf)
                out_8(ms->regs + SPI_DATA, *ms->tx_buf++);
        else
@@ -278,7 +278,7 @@ mpc52xx_spi_fsmstate_wait(int irq, struct mpc52xx_spi *ms, u8 status, u8 data)
                dev_err(&ms->master->dev, "spurious irq, status=0x%.2x\n",
                        status);
 
-       if (((int)get_tbl()) - ms->timestamp < 0)
+       if (((int)mftb()) - ms->timestamp < 0)
                return FSM_POLL;
 
        ms->message->actual_length += ms->transfer->len;
index 0d783e1..442b666 100755 (executable)
@@ -1,28 +1,13 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0-only
 
-KSELFTESTS_SKIP=4
-
 . ./eeh-functions.sh
 
-if ! eeh_supported ; then
-       echo "EEH not supported on this system, skipping"
-       exit $KSELFTESTS_SKIP;
-fi
-
-if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
-   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
-       echo "debugfs EEH testing files are missing. Is debugfs mounted?"
-       exit $KSELFTESTS_SKIP;
-fi
+eeh_test_prep # NB: may exit
 
 pre_lspci=`mktemp`
 lspci > $pre_lspci
 
-# Bump the max freeze count to something absurd so we don't
-# trip over it while breaking things.
-echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
-
 # record the devices that we break in here. Assuming everything
 # goes to plan we should get them back once the recovery process
 # is finished.
@@ -30,34 +15,16 @@ devices=""
 
 # Build up a list of candidate devices.
 for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
-       # skip bridges since we can't recover them (yet...)
-       if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
-               echo "$dev, Skipped: bridge"
+       if ! eeh_can_break $dev ; then
                continue;
        fi
 
-       # Skip VFs for now since we don't have a reliable way
-       # to break them.
+       # Skip VFs for now since we don't have a reliable way to break them.
        if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
                echo "$dev, Skipped: virtfn"
                continue;
        fi
 
-       if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
-               echo "$dev, Skipped: ahci doesn't support recovery"
-               continue
-       fi
-
-       # Don't inject errors into an already-frozen PE. This happens with
-       # PEs that contain multiple PCI devices (e.g. multi-function cards)
-       # and injecting new errors during the recovery process will probably
-       # result in the recovery failing and the device being marked as
-       # failed.
-       if ! pe_ok $dev ; then
-               echo "$dev, Skipped: Bad initial PE state"
-               continue;
-       fi
-
        echo "$dev, Added"
 
        # Add to this list of device to check
@@ -86,5 +53,5 @@ echo "$failed devices failed to recover ($dev_count tested)"
 lspci | diff -u $pre_lspci -
 rm -f $pre_lspci
 
-test "$failed" == 0
+test "$failed" -eq 0
 exit $?
old mode 100755 (executable)
new mode 100644 (file)
index 00dc32c..70daa39
@@ -1,6 +1,12 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0-only
 
+export KSELFTESTS_SKIP=4
+
+log() {
+       echo >/dev/stderr $*
+}
+
 pe_ok() {
        local dev="$1"
        local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
@@ -39,6 +45,52 @@ eeh_supported() {
        grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
 }
 
+eeh_test_prep() {
+       if ! eeh_supported ; then
+               echo "EEH not supported on this system, skipping"
+               exit $KSELFTESTS_SKIP;
+       fi
+
+       if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
+          [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
+               log "debugfs EEH testing files are missing. Is debugfs mounted?"
+               exit $KSELFTESTS_SKIP;
+       fi
+
+       # Bump the max freeze count to something absurd so we don't
+       # trip over it while breaking things.
+       echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
+}
+
+eeh_can_break() {
+       local dev="$1"
+
+       # skip bridges since we can't recover them (yet...)
+       if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
+               log "$dev, Skipped: bridge"
+               return 1;
+       fi
+
+       # The ahci driver doesn't support error recovery. If the ahci device
+       # happens to be hosting the root filesystem and we then go and break
+       # it, the system will generally go down. We should probably fix that
+       # at some point.
+       if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
+               log "$dev, Skipped: ahci doesn't support recovery"
+               return 1;
+       fi
+
+       # Don't inject errors into an already-frozen PE. This happens with
+       # PEs that contain multiple PCI devices (e.g. multi-function cards)
+       # and injecting new errors during the recovery process will probably
+       # result in the recovery failing and the device being marked as
+       # failed.
+       if ! pe_ok $dev ; then
+               log "$dev, Skipped: Bad initial PE state"
+               return 1;
+       fi
+
+       return 0
+}
+
 eeh_one_dev() {
        local dev="$1"
 
@@ -46,7 +98,7 @@ eeh_one_dev() {
        # testing so check that the argument is a well-formed sysfs device
        # name.
        if ! test -e /sys/bus/pci/devices/$dev/ ; then
-               echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
+               log "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
                return 1;
        fi
 
@@ -70,16 +122,124 @@ eeh_one_dev() {
                if pe_ok $dev ; then
                        break;
                fi
-               echo "$dev, waited $i/${max_wait}"
+               log "$dev, waited $i/${max_wait}"
                sleep 1
        done
 
        if ! pe_ok $dev ; then
-               echo "$dev, Failed to recover!"
+               log "$dev, Failed to recover!"
                return 1;
        fi
 
-       echo "$dev, Recovered after $i seconds"
+       log "$dev, Recovered after $i seconds"
        return 0;
 }
 
+eeh_has_driver() {
+       test -e /sys/bus/pci/devices/$1/driver;
+       return $?
+}
+
+eeh_can_recover() {
+       # we'll get an IO error if the device's current driver doesn't support
+       # error recovery
+       echo $1 > '/sys/kernel/debug/powerpc/eeh_dev_can_recover' 2>/dev/null
+
+       return $?
+}
+
+eeh_find_all_pfs() {
+       devices=""
+
+       # SR-IOV on pseries requires hypervisor support, so check for that
+       is_pseries=""
+       if grep -q pSeries /proc/cpuinfo ; then
+               if [ ! -f /proc/device-tree/rtas/ibm,open-sriov-allow-unfreeze ] ||
+                  [ ! -f /proc/device-tree/rtas/ibm,open-sriov-map-pe-number ] ; then
+                       return 1;
+               fi
+
+               is_pseries="true"
+       fi
+
+       for dev in `ls -1 /sys/bus/pci/devices/` ; do
+               sysfs="/sys/bus/pci/devices/$dev"
+               if [ ! -e "$sysfs/sriov_numvfs" ] ; then
+                       continue
+               fi
+
+               # skip unsupported PFs on pseries
+               if [ -n "$is_pseries" ] &&
+                  [ ! -f "$sysfs/of_node/ibm,is-open-sriov-pf" ] &&
+                  [ ! -f "$sysfs/of_node/ibm,open-sriov-vf-bar-info" ] ; then
+                       continue;
+               fi
+
+               # no driver, no vfs
+               if ! eeh_has_driver $dev ; then
+                       continue
+               fi
+
+               devices="$devices $dev"
+       done
+
+       if [ -z "$devices" ] ; then
+               return 1;
+       fi
+
+       echo $devices
+       return 0;
+}
+
+# attempts to enable one VF on each PF so we can do VF specific tests.
+# stdout: list of enabled VFs, one per line
+# return code: 0 if vfs are found, 1 otherwise
+eeh_enable_vfs() {
+       pf_list="$(eeh_find_all_pfs)"
+
+       vfs=0
+       for dev in $pf_list ; do
+               pf_sysfs="/sys/bus/pci/devices/$dev"
+
+               # make sure we have a single VF
+               echo 0 > "$pf_sysfs/sriov_numvfs"
+               echo 1 > "$pf_sysfs/sriov_numvfs"
+               if [ "$?" != 0 ] ; then
+                       log "Unable to enable VFs on $pf, skipping"
+                       continue;
+               fi
+
+               vf="$(basename $(realpath "$pf_sysfs/virtfn0"))"
+               if [ $? != 0 ] ; then
+                       log "unable to find enabled vf on $pf"
+                       echo 0 > "$pf_sysfs/sriov_numvfs"
+                       continue;
+               fi
+
+               if ! eeh_can_break $vf ; then
+                       log "skipping "
+
+                       echo 0 > "$pf_sysfs/sriov_numvfs"
+                       continue;
+               fi
+
+               vfs="$((vfs + 1))"
+               echo $vf
+       done
+
+       test "$vfs" != 0
+       return $?
+}
+
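+# Disable the VFs on each PF found by eeh_find_all_pfs.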
+eeh_disable_vfs() {
+       pf_list="$(eeh_find_all_pfs)"
+       if [ -z "$pf_list" ] ; then
+               return 1;
+       fi
+
+       for dev in $pf_list ; do
+               echo 0 > "/sys/bus/pci/devices/$dev/sriov_numvfs"
+       done
+
+       return 0;
+}
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh b/tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
new file mode 100755 (executable)
index 0000000..874c119
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
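+#
+# Check that VFs whose drivers implement the EEH error recovery callbacks
+# can recover from an injected error with the driver still bound.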
+
+. ./eeh-functions.sh
+
+eeh_test_prep # NB: may exit
+
+vf_list="$(eeh_enable_vfs)";
+if [ $? != 0 ] ; then
+       log "No usable VFs found. Skipping EEH unaware VF test"
+       exit $KSELFTESTS_SKIP;
+fi
+
+log "Enabled VFs: $vf_list"
+
+tested=0
+passed=0
+for vf in $vf_list ; do
+       log "Testing $vf"
+
+       if ! eeh_can_recover $vf ; then
+               log "Driver for $vf doesn't support error recovery, skipping"
+               continue;
+       fi
+
+       tested="$((tested + 1))"
+
+       log "Breaking $vf..."
+       if ! eeh_one_dev $vf ; then
+               log "$vf failed to recover"
+               continue;
+       fi
+
+       passed="$((passed + 1))"
+done
+
+eeh_disable_vfs
+
+if [ "$tested" == 0 ] ; then
+       echo "No VFs with EEH aware drivers found, skipping"
+       exit $KSELFTESTS_SKIP
+fi
+
+test "$failed" != 0
+exit $?;
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh b/tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh
new file mode 100755 (executable)
index 0000000..8a4c147
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
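+#
+# Check that VFs can recover from an injected error without driver
+# participation. Drivers that support error recovery are unbound first so
+# the EEH core has to handle the recovery itself.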
+
+. ./eeh-functions.sh
+
+eeh_test_prep # NB: may exit
+
+vf_list="$(eeh_enable_vfs)";
+if [ $? != 0 ] ; then
+       log "No usable VFs found. Skipping EEH unaware VF test"
+       exit $KSELFTESTS_SKIP;
+fi
+
+log "Enabled VFs: $vf_list"
+
+failed=0
+for vf in $vf_list ; do
+       log "Testing $vf"
+
+       if eeh_can_recover $vf ; then
+               log "Driver for $vf supports error recovery. Unbinding..."
+               echo "$vf" > /sys/bus/pci/devices/$vf/driver/unbind
+       fi
+
+       log "Breaking $vf..."
+       if ! eeh_one_dev $vf ; then
+               log "$vf failed to recover"
+               failed="$((failed + 1))"
+       fi
+done
+
+eeh_disable_vfs
+
+test "$failed" != 0
+exit $?;