Merge branch 'topic/paca' into next
authorMichael Ellerman <mpe@ellerman.id.au>
Fri, 30 Mar 2018 13:11:24 +0000 (00:11 +1100)
committerMichael Ellerman <mpe@ellerman.id.au>
Fri, 30 Mar 2018 22:09:36 +0000 (09:09 +1100)
Bring in yet another series that touches KVM code, and might need to
be merged into the kvm-ppc branch to resolve conflicts.

This required some changes in pnv_power9_force_smt4_catch/release()
due to the paca array becoming an array of pointers.

46 files changed:
arch/powerpc/include/asm/book3s/64/hash.h
arch/powerpc/include/asm/book3s/64/radix.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/lppaca.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/pmc.h
arch/powerpc/include/asm/setup.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/sparsemem.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/crash.c
arch/powerpc/kernel/head_64.S
arch/powerpc/kernel/machine_kexec_64.c
arch/powerpc/kernel/paca.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup.h
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/sysfs.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable-book3s64.c
arch/powerpc/mm/pgtable-radix.c
arch/powerpc/mm/tlb-radix.c
arch/powerpc/platforms/85xx/smp.c
arch/powerpc/platforms/cell/smp.c
arch/powerpc/platforms/powernv/idle.c
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/platforms/powernv/smp.c
arch/powerpc/platforms/powernv/subcore.c
arch/powerpc/platforms/pseries/hotplug-cpu.c
arch/powerpc/platforms/pseries/kexec.c
arch/powerpc/platforms/pseries/lpar.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/platforms/pseries/smp.c
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/xics/icp-native.c
arch/powerpc/xmon/xmon.c
include/linux/memblock.h
mm/memblock.c

index 935adcd..cc8cd65 100644 (file)
@@ -212,7 +212,7 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
 extern void hash__vmemmap_remove_mapping(unsigned long start,
                                     unsigned long page_size);
 
-int hash__create_section_mapping(unsigned long start, unsigned long end);
+int hash__create_section_mapping(unsigned long start, unsigned long end, int nid);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 #endif /* !__ASSEMBLY__ */
index 365010f..705193e 100644 (file)
@@ -313,7 +313,7 @@ static inline unsigned long radix__get_tree_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
 int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
index 7765a80..b7d066b 100644 (file)
@@ -436,15 +436,15 @@ struct openpic;
 extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
-       paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
+       paca_ptrs[cpu]->kvm_hstate.xics_phys = (void __iomem *)addr;
 }
 
 static inline void kvmppc_set_xive_tima(int cpu,
                                        unsigned long phys_addr,
                                        void __iomem *virt_addr)
 {
-       paca[cpu].kvm_hstate.xive_tima_phys = (void __iomem *)phys_addr;
-       paca[cpu].kvm_hstate.xive_tima_virt = virt_addr;
+       paca_ptrs[cpu]->kvm_hstate.xive_tima_phys = (void __iomem *)phys_addr;
+       paca_ptrs[cpu]->kvm_hstate.xive_tima_virt = virt_addr;
 }
 
 static inline u32 kvmppc_get_xics_latch(void)
@@ -458,7 +458,7 @@ static inline u32 kvmppc_get_xics_latch(void)
 
 static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 {
-       paca[cpu].kvm_hstate.host_ipi = host_ipi;
+       paca_ptrs[cpu]->kvm_hstate.host_ipi = host_ipi;
 }
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
index d0a2a2f..65d5896 100644 (file)
 #include <asm/mmu.h>
 
 /*
- * We only have to have statically allocated lppaca structs on
- * legacy iSeries, which supports at most 64 cpus.
- */
-#define NR_LPPACAS     1
-
-/*
- * The Hypervisor barfs if the lppaca crosses a page boundary.  A 1k
- * alignment is sufficient to prevent this
+ * The lppaca is the "virtual processor area" registered with the hypervisor,
+ * H_REGISTER_VPA etc.
+ *
+ * According to PAPR, the structure is 640 bytes long, must be L1 cache line
+ * aligned, and must not cross a 4kB boundary. Its size field must be at
+ * least 640 bytes (but may be more).
+ *
+ * Pre-v4.14 KVM hypervisors reject the VPA if its size field is smaller than
+ * 1kB, so we dynamically allocate 1kB and advertise size as 1kB, but keep
+ * this structure as the canonical 640 byte size.
  */
 struct lppaca {
        /* cacheline 1 contains read-only data */
@@ -97,13 +99,11 @@ struct lppaca {
 
        __be32  page_ins;               /* CMO Hint - # page ins by OS */
        u8      reserved11[148];
-       volatile __be64 dtl_idx;                /* Dispatch Trace Log head index */
+       volatile __be64 dtl_idx;        /* Dispatch Trace Log head index */
        u8      reserved12[96];
-} __attribute__((__aligned__(0x400)));
-
-extern struct lppaca lppaca[];
+} ____cacheline_aligned;
 
-#define lppaca_of(cpu) (*paca[cpu].lppaca_ptr)
+#define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr)
 
 /*
  * We are using a non architected field to determine if a partition is
index c97b411..4185f1c 100644 (file)
@@ -47,7 +47,10 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */
 #define get_paca()     local_paca
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
 #define get_lppaca()   (get_paca()->lppaca_ptr)
+#endif
+
 #define get_slb_shadow()       (get_paca()->slb_shadow_ptr)
 
 struct task_struct;
@@ -59,7 +62,7 @@ struct task_struct;
  * processor.
  */
 struct paca_struct {
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_PPC_PSERIES
        /*
         * Because hw_cpu_id, unlike other paca fields, is accessed
         * routinely from other CPUs (from the IRQ code), we stick to
@@ -68,7 +71,8 @@ struct paca_struct {
         */
 
        struct lppaca *lppaca_ptr;      /* Pointer to LpPaca for PLIC */
-#endif /* CONFIG_PPC_BOOK3S */
+#endif /* CONFIG_PPC_PSERIES */
+
        /*
         * MAGIC: the spinlock functions in arch/powerpc/lib/locks.c 
         * load lock_token and paca_index with a single lwz
@@ -161,10 +165,14 @@ struct paca_struct {
        u64 saved_msr;                  /* MSR saved here by enter_rtas */
        u16 trap_save;                  /* Used when bad stack is encountered */
        u8 irq_soft_mask;               /* mask for irq soft masking */
+       u8 soft_enabled;                /* irq soft-enable flag */
        u8 irq_happened;                /* irq happened while soft-disabled */
        u8 io_sync;                     /* writel() needs spin_unlock sync */
        u8 irq_work_pending;            /* IRQ_WORK interrupt while soft-disable */
        u8 nap_state_lost;              /* NV GPR values lost in power7_idle */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       u8 pmcregs_in_use;              /* pseries puts this in lppaca */
+#endif
        u64 sprg_vdso;                  /* Saved user-visible sprg */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        u64 tm_scratch;                 /* TM scratch area for reclaim */
@@ -244,18 +252,20 @@ struct paca_struct {
        void *rfi_flush_fallback_area;
        u64 l1d_flush_size;
 #endif
-};
+} ____cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
-extern struct paca_struct *paca;
+extern struct paca_struct **paca_ptrs;
 extern void initialise_paca(struct paca_struct *new_paca, int cpu);
 extern void setup_paca(struct paca_struct *new_paca);
-extern void allocate_pacas(void);
+extern void allocate_paca_ptrs(void);
+extern void allocate_paca(int cpu);
 extern void free_unused_pacas(void);
 
 #else /* CONFIG_PPC64 */
 
-static inline void allocate_pacas(void) { };
+static inline void allocate_paca_ptrs(void) { };
+static inline void allocate_paca(int cpu) { };
 static inline void free_unused_pacas(void) { };
 
 #endif /* CONFIG_PPC64 */
index 5a9ede4..7ac3586 100644 (file)
@@ -31,10 +31,21 @@ void ppc_enable_pmcs(void);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 #include <asm/lppaca.h>
+#include <asm/firmware.h>
 
 static inline void ppc_set_pmu_inuse(int inuse)
 {
-       get_lppaca()->pmcregs_in_use = inuse;
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+       if (firmware_has_feature(FW_FEATURE_LPAR)) {
+#ifdef CONFIG_PPC_PSERIES
+               get_lppaca()->pmcregs_in_use = inuse;
+#endif
+       } else {
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+               get_paca()->pmcregs_in_use = inuse;
+#endif
+       }
+#endif
 }
 
 extern void power4_enable_pmcs(void);
index bbcdf92..27fa52e 100644 (file)
@@ -23,6 +23,7 @@ extern void reloc_got2(unsigned long);
 #define PTRRELOC(x)    ((typeof(x)) add_reloc_offset((unsigned long)(x)))
 
 void check_for_initrd(void);
+void mem_topology_setup(void);
 void initmem_init(void);
 void setup_panic(void);
 #define ARCH_PANIC_TIMEOUT 180
index fac963e..cfecfee 100644 (file)
@@ -31,6 +31,7 @@
 
 extern int boot_cpuid;
 extern int spinning_secondaries;
+extern u32 *cpu_to_phys_id;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
@@ -170,12 +171,12 @@ static inline const struct cpumask *cpu_sibling_mask(int cpu)
 #ifdef CONFIG_PPC64
 static inline int get_hard_smp_processor_id(int cpu)
 {
-       return paca[cpu].hw_cpu_id;
+       return paca_ptrs[cpu]->hw_cpu_id;
 }
 
 static inline void set_hard_smp_processor_id(int cpu, int phys)
 {
-       paca[cpu].hw_cpu_id = phys;
+       paca_ptrs[cpu]->hw_cpu_id = phys;
 }
 #else
 /* 32-bit */
index a7916ee..bc66712 100644 (file)
@@ -17,7 +17,7 @@
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end);
+extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
 
 #ifdef CONFIG_PPC_BOOK3S_64
index daf809a..6bee65f 100644 (file)
@@ -221,12 +221,17 @@ int main(void)
        OFFSET(PACA_EXMC, paca_struct, exmc);
        OFFSET(PACA_EXSLB, paca_struct, exslb);
        OFFSET(PACA_EXNMI, paca_struct, exnmi);
+#ifdef CONFIG_PPC_PSERIES
        OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
+#endif
        OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr);
        OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid);
        OFFSET(SLBSHADOW_STACKESID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid);
        OFFSET(SLBSHADOW_SAVEAREA, slb_shadow, save_area);
        OFFSET(LPPACA_PMCINUSE, lppaca, pmcregs_in_use);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       OFFSET(PACA_PMCINUSE, paca_struct, pmcregs_in_use);
+#endif
        OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx);
        OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count);
        OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx);
index 00b2151..17c8b99 100644 (file)
@@ -238,7 +238,7 @@ static void __maybe_unused crash_kexec_wait_realmode(int cpu)
                if (i == cpu)
                        continue;
 
-               while (paca[i].kexec_state < KEXEC_STATE_REAL_MODE) {
+               while (paca_ptrs[i]->kexec_state < KEXEC_STATE_REAL_MODE) {
                        barrier();
                        if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0))
                                break;
index a61151a..6eca15f 100644 (file)
@@ -392,19 +392,20 @@ generic_secondary_common_init:
         * physical cpu id in r24, we need to search the pacas to find
         * which logical id maps to our physical one.
         */
-       LOAD_REG_ADDR(r13, paca)        /* Load paca pointer             */
-       ld      r13,0(r13)              /* Get base vaddr of paca array  */
 #ifndef CONFIG_SMP
-       addi    r13,r13,PACA_SIZE       /* know r13 if used accidentally */
        b       kexec_wait              /* wait for next kernel if !SMP  */
 #else
+       LOAD_REG_ADDR(r8, paca_ptrs)    /* Load paca_ptrs pointer        */
+       ld      r8,0(r8)                /* Get base vaddr of array       */
        LOAD_REG_ADDR(r7, nr_cpu_ids)   /* Load nr_cpu_ids address       */
        lwz     r7,0(r7)                /* also the max paca allocated   */
        li      r5,0                    /* logical cpu id                */
-1:     lhz     r6,PACAHWCPUID(r13)     /* Load HW procid from paca      */
+1:
+       sldi    r9,r5,3                 /* get paca_ptrs[] index from cpu id */
+       ldx     r13,r9,r8               /* r13 = paca_ptrs[cpu id]       */
+       lhz     r6,PACAHWCPUID(r13)     /* Load HW procid from paca      */
        cmpw    r6,r24                  /* Compare to our id             */
        beq     2f
-       addi    r13,r13,PACA_SIZE       /* Loop to next PACA on miss     */
        addi    r5,r5,1
        cmpw    r5,r7                   /* Check if more pacas exist     */
        blt     1b
@@ -756,10 +757,10 @@ _GLOBAL(pmac_secondary_start)
        mtmsrd  r3                      /* RI on */
 
        /* Set up a paca value for this processor. */
-       LOAD_REG_ADDR(r4,paca)          /* Load paca pointer            */
-       ld      r4,0(r4)                /* Get base vaddr of paca array */
-       mulli   r13,r24,PACA_SIZE       /* Calculate vaddr of right paca */
-       add     r13,r13,r4              /* for this processor.          */
+       LOAD_REG_ADDR(r4,paca_ptrs)     /* Load paca pointer            */
+       ld      r4,0(r4)                /* Get base vaddr of paca_ptrs array */
+       sldi    r5,r24,3                /* get paca_ptrs[] index from cpu id */
+       ldx     r13,r5,r4               /* r13 = paca_ptrs[cpu id]       */
        SET_PACA(r13)                   /* Save vaddr of paca in an SPRG*/
 
        /* Mark interrupts soft and hard disabled (they might be enabled
index 49d34d7..1044bf1 100644 (file)
@@ -168,24 +168,25 @@ static void kexec_prepare_cpus_wait(int wait_state)
         * are correctly onlined.  If somehow we start a CPU on boot with RTAS
         * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
         * time, the boot CPU will timeout.  If it does eventually execute
-        * stuff, the secondary will start up (paca[].cpu_start was written) and
-        * get into a peculiar state.  If the platform supports
-        * smp_ops->take_timebase(), the secondary CPU will probably be spinning
-        * in there.  If not (i.e. pseries), the secondary will continue on and
-        * try to online itself/idle/etc. If it survives that, we need to find
-        * these possible-but-not-online-but-should-be CPUs and chaperone them
-        * into kexec_smp_wait().
+        * stuff, the secondary will start up (paca_ptrs[]->cpu_start was
+        * written) and get into a peculiar state.
+        * If the platform supports smp_ops->take_timebase(), the secondary CPU
+        * will probably be spinning in there.  If not (i.e. pseries), the
+        * secondary will continue on and try to online itself/idle/etc. If it
+        * survives that, we need to find these
+        * possible-but-not-online-but-should-be CPUs and chaperone them into
+        * kexec_smp_wait().
         */
        for_each_online_cpu(i) {
                if (i == my_cpu)
                        continue;
 
-               while (paca[i].kexec_state < wait_state) {
+               while (paca_ptrs[i]->kexec_state < wait_state) {
                        barrier();
                        if (i != notified) {
                                printk(KERN_INFO "kexec: waiting for cpu %d "
                                       "(physical %d) to enter %i state\n",
-                                      i, paca[i].hw_cpu_id, wait_state);
+                                      i, paca_ptrs[i]->hw_cpu_id, wait_state);
                                notified = i;
                        }
                }
@@ -322,18 +323,24 @@ void default_machine_kexec(struct kimage *image)
        kexec_stack.thread_info.cpu = current_thread_info()->cpu;
 
        /* We need a static PACA, too; copy this CPU's PACA over and switch to
-        * it.  Also poison per_cpu_offset to catch anyone using non-static
-        * data.
+        * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using
+        * non-static data.
         */
        memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct));
        kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL;
-       paca = (struct paca_struct *)RELOC_HIDE(&kexec_paca, 0) -
-               kexec_paca.paca_index;
+#ifdef CONFIG_PPC_PSERIES
+       kexec_paca.lppaca_ptr = NULL;
+#endif
+       paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
+
        setup_paca(&kexec_paca);
 
-       /* XXX: If anyone does 'dynamic lppacas' this will also need to be
-        * switched to a static version!
+       /*
+        * The lppaca should be unregistered at this point so the HV won't
+        * touch it. In the case of a crash, none of the lppacas are
+        * unregistered so there is not much we can do about it here.
         */
+
        /*
         * On Book3S, the copy must happen with the MMU off if we are either
         * using Radix page tables or we are not in an LPAR since we can
index 2fd563d..0ee3e6d 100644 (file)
 
 #include "setup.h"
 
-#ifdef CONFIG_PPC_BOOK3S
+#ifndef CONFIG_SMP
+#define boot_cpuid 0
+#endif
+
+static void *__init alloc_paca_data(unsigned long size, unsigned long align,
+                               unsigned long limit, int cpu)
+{
+       unsigned long pa;
+       int nid;
+
+       /*
+        * boot_cpuid paca is allocated very early before cpu_to_node is up.
+        * Set bottom-up mode, because the boot CPU should be on node-0,
+        * which will put its paca in the right place.
+        */
+       if (cpu == boot_cpuid) {
+               nid = -1;
+               memblock_set_bottom_up(true);
+       } else {
+               nid = early_cpu_to_node(cpu);
+       }
+
+       pa = memblock_alloc_base_nid(size, align, limit, nid, MEMBLOCK_NONE);
+       if (!pa) {
+               pa = memblock_alloc_base(size, align, limit);
+               if (!pa)
+                       panic("cannot allocate paca data");
+       }
+
+       if (cpu == boot_cpuid)
+               memblock_set_bottom_up(false);
+
+       return __va(pa);
+}
+
+#ifdef CONFIG_PPC_PSERIES
 
 /*
- * The structure which the hypervisor knows about - this structure
- * should not cross a page boundary.  The vpa_init/register_vpa call
- * is now known to fail if the lppaca structure crosses a page
- * boundary.  The lppaca is also used on POWER5 pSeries boxes.
- * The lppaca is 640 bytes long, and cannot readily
- * change since the hypervisor knows its layout, so a 1kB alignment
- * will suffice to ensure that it doesn't cross a page boundary.
+ * See asm/lppaca.h for more detail.
+ *
+ * lppaca structures must be 1kB in size, L1 cache line aligned,
+ * and not cross 4kB boundary. A 1kB size and 1kB alignment will satisfy
+ * these requirements.
  */
-struct lppaca lppaca[] = {
-       [0 ... (NR_LPPACAS-1)] = {
+static inline void init_lppaca(struct lppaca *lppaca)
+{
+       BUILD_BUG_ON(sizeof(struct lppaca) != 640);
+
+       *lppaca = (struct lppaca) {
                .desc = cpu_to_be32(0xd397d781),        /* "LpPa" */
-               .size = cpu_to_be16(sizeof(struct lppaca)),
+               .size = cpu_to_be16(0x400),
                .fpregs_in_use = 1,
                .slb_count = cpu_to_be16(64),
                .vmxregs_in_use = 0,
-               .page_ins = 0,
-       },
+               .page_ins = 0, };
 };
 
-static struct lppaca *extra_lppacas;
-static long __initdata lppaca_size;
-
-static void __init allocate_lppacas(int nr_cpus, unsigned long limit)
-{
-       if (nr_cpus <= NR_LPPACAS)
-               return;
-
-       lppaca_size = PAGE_ALIGN(sizeof(struct lppaca) *
-                                (nr_cpus - NR_LPPACAS));
-       extra_lppacas = __va(memblock_alloc_base(lppaca_size,
-                                                PAGE_SIZE, limit));
-}
-
-static struct lppaca * __init new_lppaca(int cpu)
+static struct lppaca * __init new_lppaca(int cpu, unsigned long limit)
 {
        struct lppaca *lp;
+       size_t size = 0x400;
 
-       if (cpu < NR_LPPACAS)
-               return &lppaca[cpu];
+       BUILD_BUG_ON(size < sizeof(struct lppaca));
+
+       if (early_cpu_has_feature(CPU_FTR_HVMODE))
+               return NULL;
 
-       lp = extra_lppacas + (cpu - NR_LPPACAS);
-       *lp = lppaca[0];
+       lp = alloc_paca_data(size, 0x400, limit, cpu);
+       init_lppaca(lp);
 
        return lp;
 }
-
-static void __init free_lppacas(void)
-{
-       long new_size = 0, nr;
-
-       if (!lppaca_size)
-               return;
-       nr = num_possible_cpus() - NR_LPPACAS;
-       if (nr > 0)
-               new_size = PAGE_ALIGN(nr * sizeof(struct lppaca));
-       if (new_size >= lppaca_size)
-               return;
-
-       memblock_free(__pa(extra_lppacas) + new_size, lppaca_size - new_size);
-       lppaca_size = new_size;
-}
-
-#else
-
-static inline void allocate_lppacas(int nr_cpus, unsigned long limit) { }
-static inline void free_lppacas(void) { }
-
 #endif /* CONFIG_PPC_BOOK3S */
 
 #ifdef CONFIG_PPC_BOOK3S_64
 
 /*
- * 3 persistent SLBs are registered here.  The buffer will be zero
+ * 3 persistent SLBs are allocated here.  The buffer will be zero
  * initially, hence will all be invaild until we actually write them.
  *
  * If you make the number of persistent SLB entries dynamic, please also
  * update PR KVM to flush and restore them accordingly.
  */
-static struct slb_shadow * __initdata slb_shadow;
-
-static void __init allocate_slb_shadows(int nr_cpus, int limit)
-{
-       int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus);
-
-       if (early_radix_enabled())
-               return;
-
-       slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit));
-       memset(slb_shadow, 0, size);
-}
-
-static struct slb_shadow * __init init_slb_shadow(int cpu)
+static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit)
 {
        struct slb_shadow *s;
 
-       if (early_radix_enabled())
-               return NULL;
-
-       s = &slb_shadow[cpu];
+       if (cpu != boot_cpuid) {
+               /*
+                * Boot CPU comes here before early_radix_enabled
+                * is parsed (e.g., for disable_radix). So allocate
+                * always and this will be fixed up in free_unused_pacas.
+                */
+               if (early_radix_enabled())
+                       return NULL;
+       }
 
-       /*
-        * When we come through here to initialise boot_paca, the slb_shadow
-        * buffers are not allocated yet. That's OK, we'll get one later in
-        * boot, but make sure we don't corrupt memory at 0.
-        */
-       if (!slb_shadow)
-               return NULL;
+       s = alloc_paca_data(sizeof(*s), L1_CACHE_BYTES, limit, cpu);
+       memset(s, 0, sizeof(*s));
 
        s->persistent = cpu_to_be32(SLB_NUM_BOLTED);
        s->buffer_length = cpu_to_be32(sizeof(*s));
@@ -137,10 +126,6 @@ static struct slb_shadow * __init init_slb_shadow(int cpu)
        return s;
 }
 
-#else /* !CONFIG_PPC_BOOK3S_64 */
-
-static void __init allocate_slb_shadows(int nr_cpus, int limit) { }
-
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 /* The Paca is an array with one entry per processor.  Each contains an
@@ -152,14 +137,15 @@ static void __init allocate_slb_shadows(int nr_cpus, int limit) { }
  * processors.  The processor VPD array needs one entry per physical
  * processor (not thread).
  */
-struct paca_struct *paca;
-EXPORT_SYMBOL(paca);
+struct paca_struct **paca_ptrs __read_mostly;
+EXPORT_SYMBOL(paca_ptrs);
 
 void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 {
-#ifdef CONFIG_PPC_BOOK3S
-       new_paca->lppaca_ptr = new_lppaca(cpu);
-#else
+#ifdef CONFIG_PPC_PSERIES
+       new_paca->lppaca_ptr = NULL;
+#endif
+#ifdef CONFIG_PPC_BOOK3E
        new_paca->kernel_pgd = swapper_pg_dir;
 #endif
        new_paca->lock_token = 0x8000;
@@ -173,7 +159,7 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
        new_paca->__current = &init_task;
        new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
 #ifdef CONFIG_PPC_BOOK3S_64
-       new_paca->slb_shadow_ptr = init_slb_shadow(cpu);
+       new_paca->slb_shadow_ptr = NULL;
 #endif
 
 #ifdef CONFIG_PPC_BOOK3E
@@ -203,12 +189,25 @@ void setup_paca(struct paca_struct *new_paca)
 
 }
 
-static int __initdata paca_size;
+static int __initdata paca_nr_cpu_ids;
+static int __initdata paca_ptrs_size;
+static int __initdata paca_struct_size;
+
+void __init allocate_paca_ptrs(void)
+{
+       paca_nr_cpu_ids = nr_cpu_ids;
+
+       paca_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
+       paca_ptrs = __va(memblock_alloc(paca_ptrs_size, 0));
+       memset(paca_ptrs, 0x88, paca_ptrs_size);
+}
 
-void __init allocate_pacas(void)
+void __init allocate_paca(int cpu)
 {
        u64 limit;
-       int cpu;
+       struct paca_struct *paca;
+
+       BUG_ON(cpu >= paca_nr_cpu_ids);
 
 #ifdef CONFIG_PPC_BOOK3S_64
        /*
@@ -220,40 +219,44 @@ void __init allocate_pacas(void)
        limit = ppc64_rma_size;
 #endif
 
-       paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids);
-
-       paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit));
-       memset(paca, 0, paca_size);
-
-       printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n",
-               paca_size, nr_cpu_ids, paca);
-
-       allocate_lppacas(nr_cpu_ids, limit);
-
-       allocate_slb_shadows(nr_cpu_ids, limit);
+       paca = alloc_paca_data(sizeof(struct paca_struct), L1_CACHE_BYTES,
+                               limit, cpu);
+       paca_ptrs[cpu] = paca;
+       memset(paca, 0, sizeof(struct paca_struct));
 
-       /* Can't use for_each_*_cpu, as they aren't functional yet */
-       for (cpu = 0; cpu < nr_cpu_ids; cpu++)
-               initialise_paca(&paca[cpu], cpu);
+       initialise_paca(paca, cpu);
+#ifdef CONFIG_PPC_PSERIES
+       paca->lppaca_ptr = new_lppaca(cpu, limit);
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       paca->slb_shadow_ptr = new_slb_shadow(cpu, limit);
+#endif
+       paca_struct_size += sizeof(struct paca_struct);
 }
 
 void __init free_unused_pacas(void)
 {
-       int new_size;
-
-       new_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids);
+       int new_ptrs_size;
 
-       if (new_size >= paca_size)
-               return;
+       new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
+       if (new_ptrs_size < paca_ptrs_size)
+               memblock_free(__pa(paca_ptrs) + new_ptrs_size,
+                                       paca_ptrs_size - new_ptrs_size);
 
-       memblock_free(__pa(paca) + new_size, paca_size - new_size);
+       paca_nr_cpu_ids = nr_cpu_ids;
+       paca_ptrs_size = new_ptrs_size;
 
-       printk(KERN_DEBUG "Freed %u bytes for unused pacas\n",
-               paca_size - new_size);
-
-       paca_size = new_size;
+#ifdef CONFIG_PPC_BOOK3S_64
+       if (early_radix_enabled()) {
+               /* Ugly fixup, see new_slb_shadow() */
+               memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
+                               sizeof(struct slb_shadow));
+               paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL;
+       }
+#endif
 
-       free_lppacas();
+       printk(KERN_DEBUG "Allocated %u bytes for %u pacas\n",
+                       paca_ptrs_size + paca_struct_size, nr_cpu_ids);
 }
 
 void copy_mm_to_paca(struct mm_struct *mm)
index 330c65f..9dbed48 100644 (file)
@@ -365,7 +365,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
        DBG("boot cpu: logical %d physical %d\n", found,
            be32_to_cpu(intserv[found_thread]));
        boot_cpuid = found;
-       set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread]));
 
        /*
         * PAPR defines "logical" PVR values for cpus that
@@ -403,7 +402,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
                cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT;
        else if (!dt_cpu_ftrs_in_use())
                cur_cpu_spec->cpu_features |= CPU_FTR_SMT;
+       allocate_paca(boot_cpuid);
 #endif
+       set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread]));
 
        return 0;
 }
@@ -744,7 +745,7 @@ void __init early_init_devtree(void *params)
         * FIXME .. and the initrd too? */
        move_device_tree();
 
-       allocate_pacas();
+       allocate_paca_ptrs();
 
        DBG("Scanning CPUs ...\n");
 
@@ -874,5 +875,15 @@ EXPORT_SYMBOL(cpu_to_chip_id);
 
 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
 {
+#ifdef CONFIG_SMP
+       /*
+        * Early firmware scanning must use this rather than
+        * get_hard_smp_processor_id because we don't have pacas allocated
+        * until memory topology is discovered.
+        */
+       if (cpu_to_phys_id != NULL)
+               return (int)phys_id == cpu_to_phys_id[cpu];
+#endif
+
        return (int)phys_id == get_hard_smp_processor_id(cpu);
 }
index a6002f9..56f7a2b 100644 (file)
@@ -437,6 +437,8 @@ static void __init cpu_init_thread_core_maps(int tpc)
 }
 
 
+u32 *cpu_to_phys_id = NULL;
+
 /**
  * setup_cpu_maps - initialize the following cpu maps:
  *                  cpu_possible_mask
@@ -463,6 +465,10 @@ void __init smp_setup_cpu_maps(void)
 
        DBG("smp_setup_cpu_maps()\n");
 
+       cpu_to_phys_id = __va(memblock_alloc(nr_cpu_ids * sizeof(u32),
+                                                       __alignof__(u32)));
+       memset(cpu_to_phys_id, 0, nr_cpu_ids * sizeof(u32));
+
        for_each_node_by_type(dn, "cpu") {
                const __be32 *intserv;
                __be32 cpu_be;
@@ -480,6 +486,7 @@ void __init smp_setup_cpu_maps(void)
                        intserv = of_get_property(dn, "reg", &len);
                        if (!intserv) {
                                cpu_be = cpu_to_be32(cpu);
+                               /* XXX: what is this? uninitialized?? */
                                intserv = &cpu_be;      /* assume logical == phys */
                                len = 4;
                        }
@@ -499,8 +506,8 @@ void __init smp_setup_cpu_maps(void)
                                                "enable-method", "spin-table");
 
                        set_cpu_present(cpu, avail);
-                       set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j]));
                        set_cpu_possible(cpu, true);
+                       cpu_to_phys_id[cpu] = be32_to_cpu(intserv[j]);
                        cpu++;
                }
 
@@ -835,6 +842,23 @@ static __init void print_system_info(void)
        pr_info("-----------------------------------------------------\n");
 }
 
+#ifdef CONFIG_SMP
+static void smp_setup_pacas(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               if (cpu == smp_processor_id())
+                       continue;
+               allocate_paca(cpu);
+               set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]);
+       }
+
+       memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32));
+       cpu_to_phys_id = NULL;
+}
+#endif
+
 /*
  * Called into from start_kernel this initializes memblock, which is used
  * to manage page allocation until mem_init is called.
@@ -888,6 +912,9 @@ void __init setup_arch(char **cmdline_p)
        /* Check the SMT related command line arguments (ppc64). */
        check_smt_enabled();
 
+       /* Parse memory topology */
+       mem_topology_setup();
+
        /* On BookE, setup per-core TLB data structures. */
        setup_tlb_core_data();
 
@@ -899,6 +926,7 @@ void __init setup_arch(char **cmdline_p)
         * so smp_release_cpus() does nothing for them.
         */
 #ifdef CONFIG_SMP
+       smp_setup_pacas();
        smp_release_cpus();
 #endif
 
index 3fc11e3..d144df5 100644 (file)
@@ -45,14 +45,11 @@ void emergency_stack_init(void);
 static inline void emergency_stack_init(void) { };
 #endif
 
-#ifdef CONFIG_PPC64
-void record_spr_defaults(void);
-#else
-static inline void record_spr_defaults(void) { };
-#endif
-
 #ifdef CONFIG_PPC64
 u64 ppc64_bolted_size(void);
+
+/* Default SPR values from firmware/kexec */
+extern unsigned long spr_default_dscr;
 #endif
 
 /*
index 7f76216..66f2b62 100644 (file)
@@ -110,7 +110,7 @@ void __init setup_tlb_core_data(void)
                if (cpu_first_thread_sibling(boot_cpuid) == first)
                        first = boot_cpuid;
 
-               paca[cpu].tcd_ptr = &paca[first].tcd;
+               paca_ptrs[cpu]->tcd_ptr = &paca_ptrs[first]->tcd;
 
                /*
                 * If we have threads, we need either tlbsrx.
@@ -254,6 +254,14 @@ static void cpu_ready_for_interrupts(void)
        get_paca()->kernel_msr = MSR_KERNEL;
 }
 
+unsigned long spr_default_dscr = 0;
+
+void __init record_spr_defaults(void)
+{
+       if (early_cpu_has_feature(CPU_FTR_DSCR))
+               spr_default_dscr = mfspr(SPRN_DSCR);
+}
+
 /*
  * Early initialization entry point. This is called by head.S
  * with MMU translation disabled. We rely on the "feature" of
@@ -304,7 +312,11 @@ void __init early_setup(unsigned long dt_ptr)
        early_init_devtree(__va(dt_ptr));
 
        /* Now we know the logical id of our boot cpu, setup the paca. */
-       setup_paca(&paca[boot_cpuid]);
+       if (boot_cpuid != 0) {
+               /* Poison paca_ptrs[0] again if it's not the boot cpu */
+               memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0]));
+       }
+       setup_paca(paca_ptrs[boot_cpuid]);
        fixup_boot_paca();
 
        /*
@@ -599,6 +611,21 @@ __init u64 ppc64_bolted_size(void)
 #endif
 }
 
+static void *__init alloc_stack(unsigned long limit, int cpu)
+{
+       unsigned long pa;
+
+       pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
+                                       early_cpu_to_node(cpu), MEMBLOCK_NONE);
+       if (!pa) {
+               pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
+               if (!pa)
+                       panic("cannot allocate stacks");
+       }
+
+       return __va(pa);
+}
+
 void __init irqstack_early_init(void)
 {
        u64 limit = ppc64_bolted_size();
@@ -610,12 +637,8 @@ void __init irqstack_early_init(void)
         * accessed in realmode.
         */
        for_each_possible_cpu(i) {
-               softirq_ctx[i] = (struct thread_info *)
-                       __va(memblock_alloc_base(THREAD_SIZE,
-                                           THREAD_SIZE, limit));
-               hardirq_ctx[i] = (struct thread_info *)
-                       __va(memblock_alloc_base(THREAD_SIZE,
-                                           THREAD_SIZE, limit));
+               softirq_ctx[i] = alloc_stack(limit, i);
+               hardirq_ctx[i] = alloc_stack(limit, i);
        }
 }
 
@@ -623,20 +646,21 @@ void __init irqstack_early_init(void)
 void __init exc_lvl_early_init(void)
 {
        unsigned int i;
-       unsigned long sp;
 
        for_each_possible_cpu(i) {
-               sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
-               critirq_ctx[i] = (struct thread_info *)__va(sp);
-               paca[i].crit_kstack = __va(sp + THREAD_SIZE);
+               void *sp;
+
+               sp = alloc_stack(ULONG_MAX, i);
+               critirq_ctx[i] = sp;
+               paca_ptrs[i]->crit_kstack = sp + THREAD_SIZE;
 
-               sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
-               dbgirq_ctx[i] = (struct thread_info *)__va(sp);
-               paca[i].dbg_kstack = __va(sp + THREAD_SIZE);
+               sp = alloc_stack(ULONG_MAX, i);
+               dbgirq_ctx[i] = sp;
+               paca_ptrs[i]->dbg_kstack = sp + THREAD_SIZE;
 
-               sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
-               mcheckirq_ctx[i] = (struct thread_info *)__va(sp);
-               paca[i].mc_kstack = __va(sp + THREAD_SIZE);
+               sp = alloc_stack(ULONG_MAX, i);
+               mcheckirq_ctx[i] = sp;
+               paca_ptrs[i]->mc_kstack = sp + THREAD_SIZE;
        }
 
        if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
@@ -690,23 +714,24 @@ void __init emergency_stack_init(void)
 
        for_each_possible_cpu(i) {
                struct thread_info *ti;
-               ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+
+               ti = alloc_stack(limit, i);
                memset(ti, 0, THREAD_SIZE);
                emerg_stack_init_thread_info(ti, i);
-               paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
+               paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
 
 #ifdef CONFIG_PPC_BOOK3S_64
                /* emergency stack for NMI exception handling. */
-               ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+               ti = alloc_stack(limit, i);
                memset(ti, 0, THREAD_SIZE);
                emerg_stack_init_thread_info(ti, i);
-               paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+               paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
 
                /* emergency stack for machine check exception handling. */
-               ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+               ti = alloc_stack(limit, i);
                memset(ti, 0, THREAD_SIZE);
                emerg_stack_init_thread_info(ti, i);
-               paca[i].mc_emergency_sp = (void *)ti + THREAD_SIZE;
+               paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
 #endif
        }
 }
@@ -762,7 +787,7 @@ void __init setup_per_cpu_areas(void)
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
                 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
-               paca[cpu].data_offset = __per_cpu_offset[cpu];
+               paca_ptrs[cpu]->data_offset = __per_cpu_offset[cpu];
        }
 }
 #endif
@@ -876,8 +901,9 @@ static void init_fallback_flush(void)
        memset(l1d_flush_fallback_area, 0, l1d_size * 2);
 
        for_each_possible_cpu(cpu) {
-               paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
-               paca[cpu].l1d_flush_size = l1d_size;
+               struct paca_struct *paca = paca_ptrs[cpu];
+               paca->rfi_flush_fallback_area = l1d_flush_fallback_area;
+               paca->l1d_flush_size = l1d_size;
        }
 }
 
index bbe7634..cfc08b0 100644 (file)
@@ -123,8 +123,8 @@ int smp_generic_kick_cpu(int nr)
         * cpu_start field to become non-zero After we set cpu_start,
         * the processor will continue on to secondary_start
         */
-       if (!paca[nr].cpu_start) {
-               paca[nr].cpu_start = 1;
+       if (!paca_ptrs[nr]->cpu_start) {
+               paca_ptrs[nr]->cpu_start = 1;
                smp_mb();
                return 0;
        }
@@ -657,7 +657,7 @@ void smp_prepare_boot_cpu(void)
 {
        BUG_ON(smp_processor_id() != boot_cpuid);
 #ifdef CONFIG_PPC64
-       paca[boot_cpuid].__current = current;
+       paca_ptrs[boot_cpuid]->__current = current;
 #endif
        set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
        current_set[boot_cpuid] = task_thread_info(current);
@@ -748,8 +748,8 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
        struct thread_info *ti = task_thread_info(idle);
 
 #ifdef CONFIG_PPC64
-       paca[cpu].__current = idle;
-       paca[cpu].kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD;
+       paca_ptrs[cpu]->__current = idle;
+       paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD;
 #endif
        ti->cpu = cpu;
        secondary_ti = current_set[cpu] = ti;
index 04d0bbd..755dc98 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/firmware.h>
 
 #include "cacheinfo.h"
+#include "setup.h"
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
@@ -588,21 +589,18 @@ static DEVICE_ATTR(dscr_default, 0600,
 
 static void sysfs_create_dscr_default(void)
 {
-       int err = 0;
-       if (cpu_has_feature(CPU_FTR_DSCR))
-               err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
-}
+       if (cpu_has_feature(CPU_FTR_DSCR)) {
+               int err = 0;
+               int cpu;
 
-void __init record_spr_defaults(void)
-{
-       int cpu;
+               dscr_default = spr_default_dscr;
+               for_each_possible_cpu(cpu)
+                       paca_ptrs[cpu]->dscr_default = dscr_default;
 
-       if (cpu_has_feature(CPU_FTR_DSCR)) {
-               dscr_default = mfspr(SPRN_DSCR);
-               for (cpu = 0; cpu < nr_cpu_ids; cpu++)
-                       paca[cpu].dscr_default = dscr_default;
+               err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
        }
 }
+
 #endif /* CONFIG_PPC64 */
 
 #ifdef HAS_PPC_PMC_PA6T
index 55c1022..1e1211c 100644 (file)
@@ -170,7 +170,7 @@ static bool kvmppc_ipi_thread(int cpu)
 
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
        if (cpu >= 0 && cpu < nr_cpu_ids) {
-               if (paca[cpu].kvm_hstate.xics_phys) {
+               if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
                        xics_wake_cpu(cpu);
                        return true;
                }
@@ -498,7 +498,8 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
                 * use 640 bytes of the structure though, so we should accept
                 * clients that set a size of 640.
                 */
-               if (len < 640)
+               BUILD_BUG_ON(sizeof(struct lppaca) != 640);
+               if (len < sizeof(struct lppaca))
                        break;
                vpap = &tvcpu->arch.vpa;
                err = 0;
@@ -2157,7 +2158,7 @@ static int kvmppc_grab_hwthread(int cpu)
        struct paca_struct *tpaca;
        long timeout = 10000;
 
-       tpaca = &paca[cpu];
+       tpaca = paca_ptrs[cpu];
 
        /* Ensure the thread won't go into the kernel if it wakes */
        tpaca->kvm_hstate.kvm_vcpu = NULL;
@@ -2190,7 +2191,7 @@ static void kvmppc_release_hwthread(int cpu)
 {
        struct paca_struct *tpaca;
 
-       tpaca = &paca[cpu];
+       tpaca = paca_ptrs[cpu];
        tpaca->kvm_hstate.hwthread_req = 0;
        tpaca->kvm_hstate.kvm_vcpu = NULL;
        tpaca->kvm_hstate.kvm_vcore = NULL;
@@ -2256,7 +2257,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
                vcpu->arch.thread_cpu = cpu;
                cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
        }
-       tpaca = &paca[cpu];
+       tpaca = paca_ptrs[cpu];
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
        tpaca->kvm_hstate.fake_suspend = 0;
@@ -2282,7 +2283,7 @@ static void kvmppc_wait_for_nap(int n_threads)
                 * for any threads that still have a non-NULL vcore ptr.
                 */
                for (i = 1; i < n_threads; ++i)
-                       if (paca[cpu + i].kvm_hstate.kvm_vcore)
+                       if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
                                break;
                if (i == n_threads) {
                        HMT_medium();
@@ -2292,7 +2293,7 @@ static void kvmppc_wait_for_nap(int n_threads)
        }
        HMT_medium();
        for (i = 1; i < n_threads; ++i)
-               if (paca[cpu + i].kvm_hstate.kvm_vcore)
+               if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
                        pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
@@ -2824,9 +2825,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        }
 
        for (thr = 0; thr < controlled_threads; ++thr) {
-               paca[pcpu + thr].kvm_hstate.tid = thr;
-               paca[pcpu + thr].kvm_hstate.napping = 0;
-               paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+               struct paca_struct *paca = paca_ptrs[pcpu + thr];
+
+               paca->kvm_hstate.tid = thr;
+               paca->kvm_hstate.napping = 0;
+               paca->kvm_hstate.kvm_split_mode = sip;
        }
 
        /* Initiate micro-threading (split-core) on POWER8 if required */
@@ -2943,7 +2946,9 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        } else if (hpt_on_radix) {
                /* Wait for all threads to have seen final sync */
                for (thr = 1; thr < controlled_threads; ++thr) {
-                       while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+                       struct paca_struct *paca = paca_ptrs[pcpu + thr];
+
+                       while (paca->kvm_hstate.kvm_split_mode) {
                                HMT_low();
                                barrier();
                        }
@@ -4405,7 +4410,7 @@ static int kvm_init_subcore_bitmap(void)
                int node = cpu_to_node(first_cpu);
 
                /* Ignore if it is already allocated. */
-               if (paca[first_cpu].sibling_subcore_state)
+               if (paca_ptrs[first_cpu]->sibling_subcore_state)
                        continue;
 
                sibling_subcore_state =
@@ -4420,7 +4425,8 @@ static int kvm_init_subcore_bitmap(void)
                for (j = 0; j < threads_per_core; j++) {
                        int cpu = first_cpu + j;
 
-                       paca[cpu].sibling_subcore_state = sibling_subcore_state;
+                       paca_ptrs[cpu]->sibling_subcore_state =
+                                               sibling_subcore_state;
                }
        }
        return 0;
@@ -4447,7 +4453,7 @@ static int kvmppc_book3s_init_hv(void)
 
        /*
         * We need a way of accessing the XICS interrupt controller,
-        * either directly, via paca[cpu].kvm_hstate.xics_phys, or
+        * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
         * indirectly, via OPAL.
         */
 #ifdef CONFIG_SMP
index 49a2c78..de18299 100644 (file)
@@ -251,7 +251,7 @@ void kvmhv_rm_send_ipi(int cpu)
            return;
 
        /* Else poke the target with an IPI */
-       xics_phys = paca[cpu].kvm_hstate.xics_phys;
+       xics_phys = paca_ptrs[cpu]->kvm_hstate.xics_phys;
        if (xics_phys)
                __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
        else
index dc54373..0e84930 100644 (file)
@@ -79,8 +79,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r5, 0
        mtspr   SPRN_MMCRA, r5
        isync
-       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
-       lbz     r5, LPPACA_PMCINUSE(r3)
+       lbz     r5, PACA_PMCINUSE(r13)  /* is the host using the PMU? */
        cmpwi   r5, 0
        beq     31f                     /* skip if not */
        mfspr   r5, SPRN_MMCR1
index af17721..95c616f 100644 (file)
@@ -113,8 +113,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtspr   SPRN_SPRG_VDSO_WRITE,r3
 
        /* Reload the host's PMU registers */
-       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
-       lbz     r4, LPPACA_PMCINUSE(r3)
+       lbz     r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
        cmpwi   r4, 0
        beq     23f                     /* skip if not */
 BEGIN_FTR_SECTION
index 4180b89..7587a2e 100644 (file)
@@ -781,7 +781,7 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size)
        }
 }
 
-int hash__create_section_mapping(unsigned long start, unsigned long end)
+int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
 {
        int rc = htab_bolt_mapping(start, end, __pa(start),
                                   pgprot_val(PAGE_KERNEL), mmu_linear_psize,
index 85245ef..e2f5025 100644 (file)
@@ -117,7 +117,7 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int __weak create_section_mapping(unsigned long start, unsigned long end)
+int __weak create_section_mapping(unsigned long start, unsigned long end, int nid)
 {
        return -ENODEV;
 }
@@ -137,7 +137,7 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
        resize_hpt_for_hotplug(memblock_phys_mem_size());
 
        start = (unsigned long)__va(start);
-       rc = create_section_mapping(start, start + size);
+       rc = create_section_mapping(start, start + size, nid);
        if (rc) {
                pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
                        start, start + size, rc);
@@ -212,7 +212,7 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 EXPORT_SYMBOL_GPL(walk_system_ram_range);
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(void)
+void __init mem_topology_setup(void)
 {
        max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
        min_low_pfn = MEMORY_START >> PAGE_SHIFT;
@@ -224,7 +224,10 @@ void __init initmem_init(void)
         * memblock_regions
         */
        memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+}
 
+void __init initmem_init(void)
+{
        /* XXX need to clip this if using highmem? */
        sparse_memory_present_with_active_regions(0);
        sparse_init();
index edd8d0b..57a5029 100644 (file)
@@ -831,18 +831,13 @@ out:
        of_node_put(rtas);
 }
 
-void __init initmem_init(void)
+void __init mem_topology_setup(void)
 {
-       int nid, cpu;
-
-       max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
-       max_pfn = max_low_pfn;
+       int cpu;
 
        if (parse_numa_properties())
                setup_nonnuma();
 
-       memblock_dump_all();
-
        /*
         * Modify the set of possible NUMA nodes to reflect information
         * available about the set of online nodes, and the set of nodes
@@ -853,6 +848,23 @@ void __init initmem_init(void)
 
        find_possible_nodes();
 
+       setup_node_to_cpumask_map();
+
+       reset_numa_cpu_lookup_table();
+
+       for_each_present_cpu(cpu)
+               numa_setup_cpu(cpu);
+}
+
+void __init initmem_init(void)
+{
+       int nid;
+
+       max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
+       max_pfn = max_low_pfn;
+
+       memblock_dump_all();
+
        for_each_online_node(nid) {
                unsigned long start_pfn, end_pfn;
 
@@ -863,10 +875,6 @@ void __init initmem_init(void)
 
        sparse_init();
 
-       setup_node_to_cpumask_map();
-
-       reset_numa_cpu_lookup_table();
-
        /*
         * We need the numa_cpu_lookup_table to be accurate for all CPUs,
         * even before we online them, so that we can use cpu_to_{node,mem}
@@ -876,8 +884,6 @@ void __init initmem_init(void)
         */
        cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
                                  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
-       for_each_present_cpu(cpu)
-               numa_setup_cpu(cpu);
 }
 
 static int __init early_numa(char *p)
@@ -1105,7 +1111,7 @@ static void setup_cpu_associativity_change_counters(void)
        for_each_possible_cpu(cpu) {
                int i;
                u8 *counts = vphn_cpu_change_counts[cpu];
-               volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+               volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
 
                for (i = 0; i < distance_ref_points_depth; i++)
                        counts[i] = hypervisor_counts[i];
@@ -1131,7 +1137,7 @@ static int update_cpu_associativity_changes_mask(void)
        for_each_possible_cpu(cpu) {
                int i, changed = 0;
                u8 *counts = vphn_cpu_change_counts[cpu];
-               volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+               volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
 
                for (i = 0; i < distance_ref_points_depth; i++) {
                        if (hypervisor_counts[i] != counts[i]) {
index bd6ca74..518518f 100644 (file)
@@ -155,12 +155,12 @@ void mmu_cleanup_all(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end)
+int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
 {
        if (radix_enabled())
-               return radix__create_section_mapping(start, end);
+               return radix__create_section_mapping(start, end, nid);
 
-       return hash__create_section_mapping(start, end);
+       return hash__create_section_mapping(start, end, nid);
 }
 
 int __meminit remove_section_mapping(unsigned long start, unsigned long end)
index ab9db0a..7095384 100644 (file)
@@ -48,20 +48,88 @@ static int native_register_process_table(unsigned long base, unsigned long pg_sz
        return 0;
 }
 
-static __ref void *early_alloc_pgtable(unsigned long size)
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+                       unsigned long region_start, unsigned long region_end)
 {
+       unsigned long pa = 0;
        void *pt;
 
-       pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE));
+       if (region_start || region_end) /* has region hint */
+               pa = memblock_alloc_range(size, size, region_start, region_end,
+                                               MEMBLOCK_NONE);
+       else if (nid != -1) /* has node hint */
+               pa = memblock_alloc_base_nid(size, size,
+                                               MEMBLOCK_ALLOC_ANYWHERE,
+                                               nid, MEMBLOCK_NONE);
+
+       if (!pa)
+               pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);
+
+       BUG_ON(!pa);
+
+       pt = __va(pa);
        memset(pt, 0, size);
 
        return pt;
 }
 
-int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
                          pgprot_t flags,
-                         unsigned int map_page_size)
+                         unsigned int map_page_size,
+                         int nid,
+                         unsigned long region_start, unsigned long region_end)
 {
+       unsigned long pfn = pa >> PAGE_SHIFT;
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       pgdp = pgd_offset_k(ea);
+       if (pgd_none(*pgdp)) {
+               pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+                                               region_start, region_end);
+               pgd_populate(&init_mm, pgdp, pudp);
+       }
+       pudp = pud_offset(pgdp, ea);
+       if (map_page_size == PUD_SIZE) {
+               ptep = (pte_t *)pudp;
+               goto set_the_pte;
+       }
+       if (pud_none(*pudp)) {
+               pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+                                               region_start, region_end);
+               pud_populate(&init_mm, pudp, pmdp);
+       }
+       pmdp = pmd_offset(pudp, ea);
+       if (map_page_size == PMD_SIZE) {
+               ptep = pmdp_ptep(pmdp);
+               goto set_the_pte;
+       }
+       if (!pmd_present(*pmdp)) {
+               ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+                                               region_start, region_end);
+               pmd_populate_kernel(&init_mm, pmdp, ptep);
+       }
+       ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+       set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+       smp_wmb();
+       return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+                         pgprot_t flags,
+                         unsigned int map_page_size,
+                         int nid,
+                         unsigned long region_start, unsigned long region_end)
+{
+       unsigned long pfn = pa >> PAGE_SHIFT;
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
@@ -70,61 +138,48 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
         * Make sure task size is correct as per the max adddr
         */
        BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
-       if (slab_is_available()) {
-               pgdp = pgd_offset_k(ea);
-               pudp = pud_alloc(&init_mm, pgdp, ea);
-               if (!pudp)
-                       return -ENOMEM;
-               if (map_page_size == PUD_SIZE) {
-                       ptep = (pte_t *)pudp;
-                       goto set_the_pte;
-               }
-               pmdp = pmd_alloc(&init_mm, pudp, ea);
-               if (!pmdp)
-                       return -ENOMEM;
-               if (map_page_size == PMD_SIZE) {
-                       ptep = pmdp_ptep(pmdp);
-                       goto set_the_pte;
-               }
-               ptep = pte_alloc_kernel(pmdp, ea);
-               if (!ptep)
-                       return -ENOMEM;
-       } else {
-               pgdp = pgd_offset_k(ea);
-               if (pgd_none(*pgdp)) {
-                       pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
-                       BUG_ON(pudp == NULL);
-                       pgd_populate(&init_mm, pgdp, pudp);
-               }
-               pudp = pud_offset(pgdp, ea);
-               if (map_page_size == PUD_SIZE) {
-                       ptep = (pte_t *)pudp;
-                       goto set_the_pte;
-               }
-               if (pud_none(*pudp)) {
-                       pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
-                       BUG_ON(pmdp == NULL);
-                       pud_populate(&init_mm, pudp, pmdp);
-               }
-               pmdp = pmd_offset(pudp, ea);
-               if (map_page_size == PMD_SIZE) {
-                       ptep = pmdp_ptep(pmdp);
-                       goto set_the_pte;
-               }
-               if (!pmd_present(*pmdp)) {
-                       ptep = early_alloc_pgtable(PAGE_SIZE);
-                       BUG_ON(ptep == NULL);
-                       pmd_populate_kernel(&init_mm, pmdp, ptep);
-               }
-               ptep = pte_offset_kernel(pmdp, ea);
+
+       if (unlikely(!slab_is_available()))
+               return early_map_kernel_page(ea, pa, flags, map_page_size,
+                                               nid, region_start, region_end);
+
+       /*
+        * Should make page table allocation functions be able to take a
+        * node, so we can place kernel page tables on the right nodes after
+        * boot.
+        */
+       pgdp = pgd_offset_k(ea);
+       pudp = pud_alloc(&init_mm, pgdp, ea);
+       if (!pudp)
+               return -ENOMEM;
+       if (map_page_size == PUD_SIZE) {
+               ptep = (pte_t *)pudp;
+               goto set_the_pte;
        }
+       pmdp = pmd_alloc(&init_mm, pudp, ea);
+       if (!pmdp)
+               return -ENOMEM;
+       if (map_page_size == PMD_SIZE) {
+               ptep = pmdp_ptep(pmdp);
+               goto set_the_pte;
+       }
+       ptep = pte_alloc_kernel(pmdp, ea);
+       if (!ptep)
+               return -ENOMEM;
 
 set_the_pte:
-       set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
+       set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
        smp_wmb();
        return 0;
 }
 
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+                         pgprot_t flags,
+                         unsigned int map_page_size)
+{
+       return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
+}
+
 #ifdef CONFIG_STRICT_KERNEL_RWX
 void radix__change_memory_range(unsigned long start, unsigned long end,
                                unsigned long clear)
@@ -211,7 +266,8 @@ static inline void __meminit print_mapping(unsigned long start,
 }
 
 static int __meminit create_physical_mapping(unsigned long start,
-                                            unsigned long end)
+                                            unsigned long end,
+                                            int nid)
 {
        unsigned long vaddr, addr, mapping_size = 0;
        pgprot_t prot;
@@ -267,7 +323,7 @@ retry:
                else
                        prot = PAGE_KERNEL;
 
-               rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size);
+               rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
                if (rc)
                        return rc;
        }
@@ -276,7 +332,7 @@ retry:
        return 0;
 }
 
-static void __init radix_init_pgtable(void)
+void __init radix_init_pgtable(void)
 {
        unsigned long rts_field;
        struct memblock_region *reg;
@@ -286,9 +342,16 @@ static void __init radix_init_pgtable(void)
        /*
         * Create the linear mapping, using standard page size for now
         */
-       for_each_memblock(memory, reg)
+       for_each_memblock(memory, reg) {
+               /*
+                * The memblock allocator is up at this point, so the
+                * page tables will be allocated within the range. No
+                * need for a node (which we don't have yet).
+                */
                WARN_ON(create_physical_mapping(reg->base,
-                                               reg->base + reg->size));
+                                               reg->base + reg->size,
+                                               -1));
+       }
 
        /* Find out how many PID bits are supported */
        if (cpu_has_feature(CPU_FTR_HVMODE)) {
@@ -317,7 +380,7 @@ static void __init radix_init_pgtable(void)
         * host.
         */
        BUG_ON(PRTB_SIZE_SHIFT > 36);
-       process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
+       process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
        /*
         * Fill in the process table.
         */
@@ -705,8 +768,8 @@ static int __meminit stop_machine_change_mapping(void *data)
 
        spin_unlock(&init_mm.page_table_lock);
        pte_clear(&init_mm, params->aligned_start, params->pte);
-       create_physical_mapping(params->aligned_start, params->start);
-       create_physical_mapping(params->end, params->aligned_end);
+       create_physical_mapping(params->aligned_start, params->start, -1);
+       create_physical_mapping(params->end, params->aligned_end, -1);
        spin_lock(&init_mm.page_table_lock);
        return 0;
 }
@@ -863,9 +926,9 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
        radix__flush_tlb_kernel_range(start, end);
 }
 
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end)
+int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
 {
-       return create_physical_mapping(start, end);
+       return create_physical_mapping(start, end, nid);
 }
 
 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
@@ -876,14 +939,25 @@ int __meminit radix__remove_section_mapping(unsigned long start, unsigned long e
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+                                pgprot_t flags, unsigned int map_page_size,
+                                int nid)
+{
+       return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
 int __meminit radix__vmemmap_create_mapping(unsigned long start,
                                      unsigned long page_size,
                                      unsigned long phys)
 {
        /* Create a PTE encoding */
        unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+       int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+       int ret;
+
+       ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+       BUG_ON(ret);
 
-       BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size));
        return 0;
 }
 
index 291eab4..a8b178d 100644 (file)
@@ -734,7 +734,7 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
                for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
                        if (sib == cpu)
                                continue;
-                       if (paca[sib].kvm_hstate.kvm_vcpu)
+                       if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
                                flush = true;
                }
                if (flush)
index f51fd35..7e966f4 100644 (file)
@@ -147,7 +147,7 @@ static void qoriq_cpu_kill(unsigned int cpu)
        for (i = 0; i < 500; i++) {
                if (is_cpu_dead(cpu)) {
 #ifdef CONFIG_PPC64
-                       paca[cpu].cpu_start = 0;
+                       paca_ptrs[cpu]->cpu_start = 0;
 #endif
                        return;
                }
@@ -328,7 +328,7 @@ static int smp_85xx_kick_cpu(int nr)
                return ret;
 
 done:
-       paca[nr].cpu_start = 1;
+       paca_ptrs[nr]->cpu_start = 1;
        generic_set_cpu_up(nr);
 
        return ret;
@@ -409,14 +409,14 @@ void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary)
        }
 
        if (disable_threadbit) {
-               while (paca[disable_cpu].kexec_state < KEXEC_STATE_REAL_MODE) {
+               while (paca_ptrs[disable_cpu]->kexec_state < KEXEC_STATE_REAL_MODE) {
                        barrier();
                        now = mftb();
                        if (!notified && now - start > 1000000) {
                                pr_info("%s/%d: waiting for cpu %d to enter KEXEC_STATE_REAL_MODE (%d)\n",
                                        __func__, smp_processor_id(),
                                        disable_cpu,
-                                       paca[disable_cpu].kexec_state);
+                                       paca_ptrs[disable_cpu]->kexec_state);
                                notified = true;
                        }
                }
index f84d52a..1aeac57 100644 (file)
@@ -83,7 +83,7 @@ static inline int smp_startup_cpu(unsigned int lcpu)
        pcpu = get_hard_smp_processor_id(lcpu);
 
        /* Fixup atomic count: it exited inside IRQ handler. */
-       task_thread_info(paca[lcpu].__current)->preempt_count   = 0;
+       task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count     = 0;
 
        /*
         * If the RTAS start-cpu token does not exist then presume the
@@ -126,7 +126,7 @@ static int smp_cell_kick_cpu(int nr)
          * cpu_start field to become non-zero. After we set cpu_start,
         * the processor will continue on to secondary_start
         */
-       paca[nr].cpu_start = 1;
+       paca_ptrs[nr]->cpu_start = 1;
 
        return 0;
 }
index 99a760e..d9e366b 100644 (file)
@@ -81,7 +81,7 @@ static int pnv_save_sprs_for_deep_states(void)
 
        for_each_possible_cpu(cpu) {
                uint64_t pir = get_hard_smp_processor_id(cpu);
-               uint64_t hsprg0_val = (uint64_t)&paca[cpu];
+               uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
 
                rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
                if (rc != 0)
@@ -174,12 +174,12 @@ static void pnv_alloc_idle_core_states(void)
                for (j = 0; j < threads_per_core; j++) {
                        int cpu = first_cpu + j;
 
-                       paca[cpu].core_idle_state_ptr = core_idle_state;
-                       paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
-                       paca[cpu].thread_mask = 1 << j;
+                       paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state;
+                       paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING;
+                       paca_ptrs[cpu]->thread_mask = 1 << j;
                        if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
                                continue;
-                       paca[cpu].thread_sibling_pacas =
+                       paca_ptrs[cpu]->thread_sibling_pacas =
                                kmalloc_node(paca_ptr_array_size,
                                             GFP_KERNEL, node);
                }
@@ -405,22 +405,20 @@ void power9_idle(void)
 void pnv_power9_force_smt4_catch(void)
 {
        int cpu, cpu0, thr;
-       struct paca_struct *tpaca;
        int awake_threads = 1;          /* this thread is awake */
        int poke_threads = 0;
        int need_awake = threads_per_core;
 
        cpu = smp_processor_id();
        cpu0 = cpu & ~(threads_per_core - 1);
-       tpaca = &paca[cpu0];
        for (thr = 0; thr < threads_per_core; ++thr) {
                if (cpu != cpu0 + thr)
-                       atomic_inc(&tpaca[thr].dont_stop);
+                       atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
        }
        /* order setting dont_stop vs testing requested_psscr */
        mb();
        for (thr = 0; thr < threads_per_core; ++thr) {
-               if (!tpaca[thr].requested_psscr)
+               if (!paca_ptrs[cpu0+thr]->requested_psscr)
                        ++awake_threads;
                else
                        poke_threads |= (1 << thr);
@@ -433,14 +431,14 @@ void pnv_power9_force_smt4_catch(void)
                        if (poke_threads & (1 << thr)) {
                                ppc_msgsnd_sync();
                                ppc_msgsnd(PPC_DBELL_MSGTYPE, 0,
-                                          tpaca[thr].hw_cpu_id);
+                                          paca_ptrs[cpu0+thr]->hw_cpu_id);
                        }
                }
                /* now spin until at least 3 threads are awake */
                do {
                        for (thr = 0; thr < threads_per_core; ++thr) {
                                if ((poke_threads & (1 << thr)) &&
-                                   !tpaca[thr].requested_psscr) {
+                                   !paca_ptrs[cpu0+thr]->requested_psscr) {
                                        ++awake_threads;
                                        poke_threads &= ~(1 << thr);
                                }
@@ -453,16 +451,14 @@ EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_catch);
 void pnv_power9_force_smt4_release(void)
 {
        int cpu, cpu0, thr;
-       struct paca_struct *tpaca;
 
        cpu = smp_processor_id();
        cpu0 = cpu & ~(threads_per_core - 1);
-       tpaca = &paca[cpu0];
 
        /* clear all the dont_stop flags */
        for (thr = 0; thr < threads_per_core; ++thr) {
                if (cpu != cpu0 + thr)
-                       atomic_dec(&tpaca[thr].dont_stop);
+                       atomic_dec(&paca_ptrs[cpu0+thr]->dont_stop);
        }
 }
 EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
@@ -830,7 +826,8 @@ static int __init pnv_init_idle_states(void)
                        for (i = 0; i < threads_per_core; i++) {
                                int j = base_cpu + i;
 
-                               paca[j].thread_sibling_pacas[idx] = &paca[cpu];
+                               paca_ptrs[j]->thread_sibling_pacas[idx] =
+                                       paca_ptrs[cpu];
                        }
                }
        }
index 7de050a..5f96328 100644 (file)
@@ -293,7 +293,7 @@ static void pnv_kexec_wait_secondaries_down(void)
                        if (i != notified) {
                                printk(KERN_INFO "kexec: waiting for cpu %d "
                                       "(physical %d) to enter OPAL\n",
-                                      i, paca[i].hw_cpu_id);
+                                      i, paca_ptrs[i]->hw_cpu_id);
                                notified = i;
                        }
 
@@ -305,7 +305,7 @@ static void pnv_kexec_wait_secondaries_down(void)
                        if (timeout-- == 0) {
                                printk(KERN_ERR "kexec: timed out waiting for "
                                       "cpu %d (physical %d) to enter OPAL\n",
-                                      i, paca[i].hw_cpu_id);
+                                      i, paca_ptrs[i]->hw_cpu_id);
                                break;
                        }
                }
index 9664c84..19af6de 100644 (file)
@@ -80,7 +80,7 @@ static int pnv_smp_kick_cpu(int nr)
         * If we already started or OPAL is not supported, we just
         * kick the CPU via the PACA
         */
-       if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPAL))
+       if (paca_ptrs[nr]->cpu_start || !firmware_has_feature(FW_FEATURE_OPAL))
                goto kick;
 
        /*
index 596ae2e..4556300 100644 (file)
@@ -280,7 +280,7 @@ void update_subcore_sibling_mask(void)
                int offset = (tid / threads_per_subcore) * threads_per_subcore;
                int mask = sibling_mask_first_cpu << offset;
 
-               paca[cpu].subcore_sibling_mask = mask;
+               paca_ptrs[cpu]->subcore_sibling_mask = mask;
 
        }
 }
index 652d3e9..6ef77ca 100644 (file)
@@ -234,7 +234,7 @@ static void pseries_cpu_die(unsigned int cpu)
         * done here.  Change isolate state to Isolate and
         * change allocation-state to Unusable.
         */
-       paca[cpu].cpu_start = 0;
+       paca_ptrs[cpu]->cpu_start = 0;
 }
 
 /*
index eeb1342..3fe1267 100644 (file)
 
 void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
 {
-       /* Don't risk a hypervisor call if we're crashing */
+       /*
+        * Don't risk a hypervisor call if we're crashing
+        * XXX: Why? The hypervisor is not crashing. It might be better
+        * to at least attempt unregister to avoid the hypervisor stepping
+        * on our memory.
+        */
        if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
                int ret;
                int cpu = smp_processor_id();
index 238b55f..adb996e 100644 (file)
@@ -99,7 +99,7 @@ void vpa_init(int cpu)
         * reports that.  All SPLPAR support SLB shadow buffer.
         */
        if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
-               addr = __pa(paca[cpu].slb_shadow_ptr);
+               addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
                ret = register_slb_shadow(hwcpu, addr);
                if (ret)
                        pr_err("WARNING: SLB shadow buffer registration for "
@@ -111,7 +111,7 @@ void vpa_init(int cpu)
        /*
         * Register dispatch trace log, if one has been allocated.
         */
-       pp = &paca[cpu];
+       pp = paca_ptrs[cpu];
        dtl = pp->dispatch_log;
        if (dtl) {
                pp->dtl_ridx = 0;
index b11564f..98bca8d 100644 (file)
@@ -247,7 +247,7 @@ static int alloc_dispatch_logs(void)
                return 0;
 
        for_each_possible_cpu(cpu) {
-               pp = &paca[cpu];
+               pp = paca_ptrs[cpu];
                dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
                if (!dtl) {
                        pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
index 66b6f11..3df4612 100644 (file)
@@ -110,7 +110,7 @@ static inline int smp_startup_cpu(unsigned int lcpu)
        }
 
        /* Fixup atomic count: it exited inside IRQ handler. */
-       task_thread_info(paca[lcpu].__current)->preempt_count   = 0;
+       task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count     = 0;
 #ifdef CONFIG_HOTPLUG_CPU
        if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
                goto out;
@@ -165,7 +165,7 @@ static int smp_pSeries_kick_cpu(int nr)
          * cpu_start field to become non-zero. After we set cpu_start,
         * the processor will continue on to secondary_start
         */
-       paca[nr].cpu_start = 1;
+       paca_ptrs[nr]->cpu_start = 1;
 #ifdef CONFIG_HOTPLUG_CPU
        set_preferred_offline_state(nr, CPU_STATE_ONLINE);
 
index 7306780..1d4e0ef 100644 (file)
@@ -626,7 +626,7 @@ static inline u32 mpic_physmask(u32 cpumask)
        int i;
        u32 mask = 0;
 
-       for (i = 0; i < min(32, NR_CPUS); ++i, cpumask >>= 1)
+       for (i = 0; i < min(32, NR_CPUS) && cpu_possible(i); ++i, cpumask >>= 1)
                mask |= (cpumask & 1) << get_hard_smp_processor_id(i);
        return mask;
 }
index 1459f4e..37bfbc5 100644 (file)
@@ -164,7 +164,7 @@ void icp_native_cause_ipi_rm(int cpu)
         * Just like the cause_ipi functions, it is required to
         * include a full barrier before causing the IPI.
         */
-       xics_phys = paca[cpu].kvm_hstate.xics_phys;
+       xics_phys = paca_ptrs[cpu]->kvm_hstate.xics_phys;
        mb();
        __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
 }
index 5391802..a0842f1 100644 (file)
@@ -2341,7 +2341,7 @@ static void dump_one_paca(int cpu)
        catch_memory_errors = 1;
        sync();
 
-       p = &paca[cpu];
+       p = paca_ptrs[cpu];
 
        printf("paca for cpu 0x%x @ %px:\n", cpu, p);
 
index 8be5077..4e1e3d0 100644 (file)
@@ -319,6 +319,9 @@ static inline bool memblock_bottom_up(void)
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
                                        phys_addr_t start, phys_addr_t end,
                                        ulong flags);
+phys_addr_t memblock_alloc_base_nid(phys_addr_t size,
+                                       phys_addr_t align, phys_addr_t max_addr,
+                                       int nid, ulong flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
                                phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
index 5a9ca2a..cea2af4 100644 (file)
@@ -1190,7 +1190,7 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
                                        flags);
 }
 
-static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
                                        phys_addr_t align, phys_addr_t max_addr,
                                        int nid, ulong flags)
 {