KVM: stats: Separate generic stats from architecture specific ones
[linux-2.6-microblaze.git] / arch/powerpc/kvm/book3s_hv.c
index 28a80d2..cd544a4 100644
@@ -76,6 +76,7 @@
 #include <asm/kvm_book3s_uvmem.h>
 #include <asm/ultravisor.h>
 #include <asm/dtl.h>
+#include <asm/plpar_wrappers.h>
 
 #include "book3s.h"
 
@@ -103,13 +104,9 @@ static int target_smt_mode;
 module_param(target_smt_mode, int, 0644);
 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
-static bool indep_threads_mode = true;
-module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
-
 static bool one_vm_per_core;
 module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
 
 #ifdef CONFIG_KVM_XICS
 static const struct kernel_param_ops module_param_ops = {
@@ -134,9 +131,6 @@ static inline bool nesting_enabled(struct kvm *kvm)
        return kvm->arch.nested_enable && kvm_is_radix(kvm);
 }
 
-/* If set, the threads on each CPU core have to be in the same MMU mode */
-static bool no_mixing_hpt_and_radix __read_mostly;
-
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
 /*
@@ -236,7 +230,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 
        waitp = kvm_arch_vcpu_get_wait(vcpu);
        if (rcuwait_wake_up(waitp))
-               ++vcpu->stat.halt_wakeup;
+               ++vcpu->stat.generic.halt_wakeup;
 
        cpu = READ_ONCE(vcpu->arch.thread_cpu);
        if (cpu >= 0 && kvmppc_ipi_thread(cpu))
@@ -807,7 +801,8 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
                 * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
                 * Keep this in synch with kvmppc_filter_guest_lpcr_hv.
                 */
-               if (mflags != 0 && mflags != 3)
+               if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
+                               kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
                        return H_UNSUPPORTED_FLAG_START;
                return H_TOO_HARD;
        default:
@@ -899,6 +894,10 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
         * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
         * have useful work to do and should not confer) so we don't
         * recheck that here.
+        *
+        * In the P9 single-vcpu-per-vcore case the real mode handler is
+        * not called, but there are no other threads in the source vcore
+        * anyway.
         */
 
        spin_lock(&vcore->lock);
@@ -924,8 +923,71 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
        return yield_count;
 }
 
+/*
+ * H_RPT_INVALIDATE hcall handler for nested guests.
+ *
+ * Handles only nested process-scoped invalidation requests in L0.
+ */
+static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
+{
+       unsigned long type = kvmppc_get_gpr(vcpu, 6);
+       unsigned long pid, pg_sizes, start, end;
+
+       /*
+        * The partition-scoped invalidations aren't handled here in L0.
+        */
+       if (type & H_RPTI_TYPE_NESTED)
+               return RESUME_HOST;
+
+       pid = kvmppc_get_gpr(vcpu, 4);
+       pg_sizes = kvmppc_get_gpr(vcpu, 7);
+       start = kvmppc_get_gpr(vcpu, 8);
+       end = kvmppc_get_gpr(vcpu, 9);
+
+       do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
+                               type, pg_sizes, start, end);
+
+       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+       return RESUME_GUEST;
+}
+
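+/*
+ * H_RPT_INVALIDATE hcall handler for L1 guests (and, via the nested branch
+ * below, partition-scoped requests on behalf of their nested guests).
+ *
+ * Argument meanings are read from the dispatch in kvmppc_pseries_do_hcall()
+ * rather than quoted from PAPR: id (r4) is the PID/LPID, target (r5) the
+ * invalidation target, type (r6) the H_RPTI_TYPE_* flags, pg_sizes (r7) the
+ * page size mask, and start/end (r8/r9) the effective address range.
+ */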
+static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
+                                   unsigned long id, unsigned long target,
+                                   unsigned long type, unsigned long pg_sizes,
+                                   unsigned long start, unsigned long end)
+{
+       if (!kvm_is_radix(vcpu->kvm))
+               return H_UNSUPPORTED;
+
+       if (end < start)
+               return H_P5;
+
+       /*
+        * Partition-scoped invalidation for nested guests.
+        */
+       if (type & H_RPTI_TYPE_NESTED) {
+               if (!nesting_enabled(vcpu->kvm))
+                       return H_FUNCTION;
+
+               /* Support only cores as target */
+               if (target != H_RPTI_TARGET_CMMU)
+                       return H_P2;
+
+               return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
+                                              start, end);
+       }
+
+       /*
+        * Process-scoped invalidation for L1 guests.
+        */
+       do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
+                               type, pg_sizes, start, end);
+       return H_SUCCESS;
+}
+
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 {
+       struct kvm *kvm = vcpu->kvm;
        unsigned long req = kvmppc_get_gpr(vcpu, 3);
        unsigned long target, ret = H_SUCCESS;
        int yield_count;
@@ -937,11 +999,57 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                return RESUME_HOST;
 
        switch (req) {
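+       /*
+        * Hash page table hcalls, handled here in virtual mode for the P9
+        * path which does not use the real-mode hcall handlers.  H_TOO_HARD
+        * means the hcall could not be completed here and is punted out via
+        * RESUME_HOST.
+        */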
+       case H_REMOVE:
+               ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5),
+                                       kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_ENTER:
+               ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5),
+                                       kvmppc_get_gpr(vcpu, 6),
+                                       kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_READ:
+               ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_CLEAR_MOD:
+               ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_CLEAR_REF:
+               ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_PROTECT:
+               ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5),
+                                       kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_BULK_REMOVE:
+               ret = kvmppc_h_bulk_remove(vcpu);
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+
        case H_CEDE:
                break;
        case H_PROD:
                target = kvmppc_get_gpr(vcpu, 4);
-               tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+               tvcpu = kvmppc_find_vcpu(kvm, target);
                if (!tvcpu) {
                        ret = H_PARAMETER;
                        break;
@@ -955,7 +1063,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                target = kvmppc_get_gpr(vcpu, 4);
                if (target == -1)
                        break;
-               tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+               tvcpu = kvmppc_find_vcpu(kvm, target);
                if (!tvcpu) {
                        ret = H_PARAMETER;
                        break;
@@ -971,12 +1079,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                                        kvmppc_get_gpr(vcpu, 6));
                break;
        case H_RTAS:
-               if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+               if (list_empty(&kvm->arch.rtas_tokens))
                        return RESUME_HOST;
 
-               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               idx = srcu_read_lock(&kvm->srcu);
                rc = kvmppc_rtas_hcall(vcpu);
-               srcu_read_unlock(&vcpu->kvm->srcu, idx);
+               srcu_read_unlock(&kvm->srcu, idx);
 
                if (rc == -ENOENT)
                        return RESUME_HOST;
@@ -1060,15 +1168,23 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
                        ret = H_HARDWARE;
                break;
+       case H_RPT_INVALIDATE:
+               ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                             kvmppc_get_gpr(vcpu, 5),
+                                             kvmppc_get_gpr(vcpu, 6),
+                                             kvmppc_get_gpr(vcpu, 7),
+                                             kvmppc_get_gpr(vcpu, 8),
+                                             kvmppc_get_gpr(vcpu, 9));
+               break;
 
        case H_SET_PARTITION_TABLE:
                ret = H_FUNCTION;
-               if (nesting_enabled(vcpu->kvm))
+               if (nesting_enabled(kvm))
                        ret = kvmhv_set_partition_table(vcpu);
                break;
        case H_ENTER_NESTED:
                ret = H_FUNCTION;
-               if (!nesting_enabled(vcpu->kvm))
+               if (!nesting_enabled(kvm))
                        break;
                ret = kvmhv_enter_nested_guest(vcpu);
                if (ret == H_INTERRUPT) {
@@ -1083,12 +1199,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                break;
        case H_TLB_INVALIDATE:
                ret = H_FUNCTION;
-               if (nesting_enabled(vcpu->kvm))
+               if (nesting_enabled(kvm))
                        ret = kvmhv_do_nested_tlbie(vcpu);
                break;
        case H_COPY_TOFROM_GUEST:
                ret = H_FUNCTION;
-               if (nesting_enabled(vcpu->kvm))
+               if (nesting_enabled(kvm))
                        ret = kvmhv_copy_tofrom_guest_nested(vcpu);
                break;
        case H_PAGE_INIT:
@@ -1099,7 +1215,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        case H_SVM_PAGE_IN:
                ret = H_UNSUPPORTED;
                if (kvmppc_get_srr1(vcpu) & MSR_S)
-                       ret = kvmppc_h_svm_page_in(vcpu->kvm,
+                       ret = kvmppc_h_svm_page_in(kvm,
                                                   kvmppc_get_gpr(vcpu, 4),
                                                   kvmppc_get_gpr(vcpu, 5),
                                                   kvmppc_get_gpr(vcpu, 6));
@@ -1107,7 +1223,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        case H_SVM_PAGE_OUT:
                ret = H_UNSUPPORTED;
                if (kvmppc_get_srr1(vcpu) & MSR_S)
-                       ret = kvmppc_h_svm_page_out(vcpu->kvm,
+                       ret = kvmppc_h_svm_page_out(kvm,
                                                    kvmppc_get_gpr(vcpu, 4),
                                                    kvmppc_get_gpr(vcpu, 5),
                                                    kvmppc_get_gpr(vcpu, 6));
@@ -1115,12 +1231,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        case H_SVM_INIT_START:
                ret = H_UNSUPPORTED;
                if (kvmppc_get_srr1(vcpu) & MSR_S)
-                       ret = kvmppc_h_svm_init_start(vcpu->kvm);
+                       ret = kvmppc_h_svm_init_start(kvm);
                break;
        case H_SVM_INIT_DONE:
                ret = H_UNSUPPORTED;
                if (kvmppc_get_srr1(vcpu) & MSR_S)
-                       ret = kvmppc_h_svm_init_done(vcpu->kvm);
+                       ret = kvmppc_h_svm_init_done(kvm);
                break;
        case H_SVM_INIT_ABORT:
                /*
@@ -1130,24 +1246,26 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                 * Instead the kvm->arch.secure_guest flag is checked inside
                 * kvmppc_h_svm_init_abort().
                 */
-               ret = kvmppc_h_svm_init_abort(vcpu->kvm);
+               ret = kvmppc_h_svm_init_abort(kvm);
                break;
 
        default:
                return RESUME_HOST;
        }
+       WARN_ON_ONCE(ret == H_TOO_HARD);
        kvmppc_set_gpr(vcpu, 3, ret);
        vcpu->arch.hcall_needed = 0;
        return RESUME_GUEST;
 }
 
 /*
- * Handle H_CEDE in the nested virtualization case where we haven't
- * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
+ * handlers in book3s_hv_rmhandlers.S.
+ *
  * This has to be done early, not in kvmppc_pseries_do_hcall(), so
  * that the cede logic in kvmppc_run_single_vcpu() works properly.
  */
-static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+static void kvmppc_cede(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.shregs.msr |= MSR_EE;
        vcpu->arch.ceded = 1;
@@ -1178,6 +1296,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
        case H_XIRR_X:
 #endif
        case H_PAGE_INIT:
+       case H_RPT_INVALIDATE:
                return 1;
        }
 
@@ -1400,13 +1519,39 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
        }
        case BOOK3S_INTERRUPT_SYSCALL:
        {
-               /* hcall - punt to userspace */
                int i;
 
-               /* hypercall with MSR_PR has already been handled in rmode,
-                * and never reaches here.
-                */
+               if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
+                       /*
+                        * Guest userspace executed sc 1. This can only be
+                        * reached by the P9 path because the old path
+                        * handles this case in realmode hcall handlers.
+                        */
+                       if (!kvmhv_vcpu_is_radix(vcpu)) {
+                               /*
+                                * A guest could be running PR KVM, so this
+                                * may be a PR KVM hcall. It must be reflected
+                                * to the guest kernel as a sc interrupt.
+                                */
+                               kvmppc_core_queue_syscall(vcpu);
+                       } else {
+                               /*
+                                * Radix guests cannot run PR KVM or nested HV
+                                * hash guests which might run PR KVM, so this
+                                * is always a privilege fault. Send a program
+                                * check to guest kernel.
+                                */
+                               kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+                       }
+                       r = RESUME_GUEST;
+                       break;
+               }
 
+               /*
+                * hcall - gather args and set exit_reason. This will next be
+                * handled by kvmppc_pseries_do_hcall which may be able to deal
+                * with it and resume guest, or may punt to userspace.
+                */
                run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
                for (i = 0; i < 9; ++i)
                        run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
@@ -1419,20 +1564,102 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
         * We get these next two if the guest accesses a page which it thinks
         * it has mapped but which is not actually present, either because
        * it is for an emulated I/O device or because the corresponding
-        * host page has been paged out.  Any other HDSI/HISI interrupts
-        * have been handled already.
+        * host page has been paged out.
+        *
+        * Any other HDSI/HISI interrupts have been handled already for P7/8
+        * guests. For POWER9 hash guests not using rmhandlers, basic hash
+        * fault handling is done here.
         */
-       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
-               r = RESUME_PAGE_FAULT;
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
+               unsigned long vsid;
+               long err;
+
+               if (vcpu->arch.fault_dsisr == HDSISR_CANARY) {
+                       r = RESUME_GUEST; /* Just retry if it's the canary */
+                       break;
+               }
+
+               if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+                       /*
+                        * Radix doesn't require anything, and pre-ISAv3.0 hash
+                        * already attempted to handle this in rmhandlers. The
+                        * hash fault handling below is v3 only (it uses ASDR
+                        * via fault_gpa).
+                        */
+                       r = RESUME_PAGE_FAULT;
+                       break;
+               }
+
+               if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
+                       kvmppc_core_queue_data_storage(vcpu,
+                               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+                       r = RESUME_GUEST;
+                       break;
+               }
+
+               if (!(vcpu->arch.shregs.msr & MSR_DR))
+                       vsid = vcpu->kvm->arch.vrma_slb_v;
+               else
+                       vsid = vcpu->arch.fault_gpa;
+
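+               /*
+                * Return convention of kvmppc_hpte_hv_fault() as used by the
+                * checks below (a reading of this call site, not a documented
+                * contract): 0 means the fault was handled and the guest can
+                * be resumed, -1/-2 mean the full page fault path is needed,
+                * and any other value is the DSISR to reflect back to the
+                * guest as a data storage interrupt.
+                */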
+               err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+                               vsid, vcpu->arch.fault_dsisr, true);
+               if (err == 0) {
+                       r = RESUME_GUEST;
+               } else if (err == -1 || err == -2) {
+                       r = RESUME_PAGE_FAULT;
+               } else {
+                       kvmppc_core_queue_data_storage(vcpu,
+                               vcpu->arch.fault_dar, err);
+                       r = RESUME_GUEST;
+               }
                break;
-       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+       }
+       case BOOK3S_INTERRUPT_H_INST_STORAGE: {
+               unsigned long vsid;
+               long err;
+
                vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
                vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
                        DSISR_SRR1_MATCH_64S;
-               if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
-                       vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
-               r = RESUME_PAGE_FAULT;
+               if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+                       /*
+                        * Radix doesn't require anything, and pre-ISAv3.0 hash
+                        * already attempted to handle this in rmhandlers. The
+                        * hash fault handling below is v3 only (it uses ASDR
+                        * via fault_gpa).
+                        */
+                       if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+                               vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+                       r = RESUME_PAGE_FAULT;
+                       break;
+               }
+
+               if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
+                       kvmppc_core_queue_inst_storage(vcpu,
+                               vcpu->arch.fault_dsisr);
+                       r = RESUME_GUEST;
+                       break;
+               }
+
+               if (!(vcpu->arch.shregs.msr & MSR_IR))
+                       vsid = vcpu->kvm->arch.vrma_slb_v;
+               else
+                       vsid = vcpu->arch.fault_gpa;
+
+               err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+                               vsid, vcpu->arch.fault_dsisr, false);
+               if (err == 0) {
+                       r = RESUME_GUEST;
+               } else if (err == -1) {
+                       r = RESUME_PAGE_FAULT;
+               } else {
+                       kvmppc_core_queue_inst_storage(vcpu, err);
+                       r = RESUME_GUEST;
+               }
                break;
+       }
+
        /*
         * This occurs if the guest executes an illegal instruction.
         * If the guest debug is disabled, generate a program interrupt
@@ -1593,6 +1820,23 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
                if (!xics_on_xive())
                        kvmppc_xics_rm_complete(vcpu, 0);
                break;
+       case BOOK3S_INTERRUPT_SYSCALL:
+       {
+               unsigned long req = kvmppc_get_gpr(vcpu, 3);
+
+               /*
+                * H_RPT_INVALIDATE hcalls issued by nested guests for
+                * process-scoped invalidations when GTSE=0 are handled
+                * here in L0.
+                */
+               if (req == H_RPT_INVALIDATE) {
+                       r = kvmppc_nested_h_rpt_invalidate(vcpu);
+                       break;
+               }
+
+               r = RESUME_HOST;
+               break;
+       }
        default:
                r = RESUME_HOST;
                break;
@@ -1654,6 +1898,14 @@ unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
                lpcr &= ~LPCR_AIL;
        if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
                lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
+       /*
+        * On some POWER9s we force AIL off for radix guests to prevent
+        * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
+        * the guest's, which can result in Q0 translations with LPID=0
+        * PID=PIDR being cached, which the host TLB management does not
+        * expect.
+        */
+       if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+               lpcr &= ~LPCR_AIL;
 
        /*
         * On POWER9, allow userspace to enable large decrementer for the
@@ -2233,7 +2485,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
  */
 static int threads_per_vcore(struct kvm *kvm)
 {
-       if (kvm->arch.threads_indep)
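+       /*
+        * With dependent-threads (indep_threads_mode=N) support removed,
+        * POWER9 and later always run a single vCPU per vcore thread; only
+        * POWER8 and earlier still gang threads_per_subcore threads together.
+        */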
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
                return 1;
        return threads_per_subcore;
 }
@@ -2657,7 +2909,7 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
        cpumask_t *cpu_in_guest;
        int i;
 
-       cpu = cpu_first_thread_sibling(cpu);
+       cpu = cpu_first_tlb_thread_sibling(cpu);
        if (nested) {
                cpumask_set_cpu(cpu, &nested->need_tlb_flush);
                cpu_in_guest = &nested->cpu_in_guest;
@@ -2671,9 +2923,10 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
         * the other side is the first smp_mb() in kvmppc_run_core().
         */
        smp_mb();
-       for (i = 0; i < threads_per_core; ++i)
-               if (cpumask_test_cpu(cpu + i, cpu_in_guest))
-                       smp_call_function_single(cpu + i, do_nothing, NULL, 1);
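+       /*
+        * Only the threads that share a TLB with this cpu need to be
+        * interrupted (the *_tlb_thread_sibling() helpers), rather than every
+        * thread of the core as before.
+        */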
+       for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
+                                       i += cpu_tlb_thread_sibling_step())
+               if (cpumask_test_cpu(i, cpu_in_guest))
+                       smp_call_function_single(i, do_nothing, NULL, 1);
 }
 
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
@@ -2704,8 +2957,8 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
         */
        if (prev_cpu != pcpu) {
                if (prev_cpu >= 0 &&
-                   cpu_first_thread_sibling(prev_cpu) !=
-                   cpu_first_thread_sibling(pcpu))
+                   cpu_first_tlb_thread_sibling(prev_cpu) !=
+                   cpu_first_tlb_thread_sibling(pcpu))
                        radix_flush_cpu(kvm, prev_cpu, vcpu);
                if (nested)
                        nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
@@ -2967,9 +3220,6 @@ static void prepare_threads(struct kvmppc_vcore *vc)
        for_each_runnable_thread(i, vcpu, vc) {
                if (signal_pending(vcpu->arch.run_task))
                        vcpu->arch.ret = -EINTR;
-               else if (no_mixing_hpt_and_radix &&
-                        kvm_is_radix(vc->kvm) != radix_enabled())
-                       vcpu->arch.ret = -EINVAL;
                else if (vcpu->arch.vpa.update_pending ||
                         vcpu->arch.slb_shadow.update_pending ||
                         vcpu->arch.dtl.update_pending)
@@ -3176,6 +3426,9 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        int trap;
        bool is_power8;
 
+       if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
+               return;
+
        /*
         * Remove from the list any threads that have a signal pending
         * or need a VPA update done
@@ -3203,9 +3456,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         * Make sure we are running on primary threads, and that secondary
         * threads are offline.  Also check if the number of threads in this
         * guest is greater than the current system threads per guest.
-        * On POWER9, we need to be not in independent-threads mode if
-        * this is a HPT guest on a radix host machine where the
-        * CPU threads may not be in different MMU modes.
         */
        if ((controlled_threads > 1) &&
            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
@@ -3229,18 +3479,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        if (vc->num_threads < target_threads)
                collect_piggybacks(&core_info, target_threads);
 
-       /*
-        * On radix, arrange for TLB flushing if necessary.
-        * This has to be done before disabling interrupts since
-        * it uses smp_call_function().
-        */
-       pcpu = smp_processor_id();
-       if (kvm_is_radix(vc->kvm)) {
-               for (sub = 0; sub < core_info.n_subcores; ++sub)
-                       for_each_runnable_thread(i, vcpu, core_info.vc[sub])
-                               kvmppc_prepare_radix_vcpu(vcpu, pcpu);
-       }
-
        /*
         * Hard-disable interrupts, and check resched flag and signals.
         * If we need to reschedule or deliver a signal, clean up
@@ -3273,8 +3511,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        cmd_bit = stat_bit = 0;
        split = core_info.n_subcores;
        sip = NULL;
-       is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
-               && !cpu_has_feature(CPU_FTR_ARCH_300);
+       is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
 
        if (split > 1) {
                sip = &split_info;
@@ -3478,184 +3715,113 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        trace_kvmppc_run_core(vc, 1);
 }
 
-/*
- * Load up hypervisor-mode registers on P9.
- */
-static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
-                                    unsigned long lpcr)
+static void load_spr_state(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
-       s64 hdec;
-       u64 tb, purr, spurr;
-       int trap;
-       unsigned long host_hfscr = mfspr(SPRN_HFSCR);
-       unsigned long host_ciabr = mfspr(SPRN_CIABR);
-       unsigned long host_dawr0 = mfspr(SPRN_DAWR0);
-       unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0);
-       unsigned long host_psscr = mfspr(SPRN_PSSCR);
-       unsigned long host_pidr = mfspr(SPRN_PID);
-       unsigned long host_dawr1 = 0;
-       unsigned long host_dawrx1 = 0;
-
-       if (cpu_has_feature(CPU_FTR_DAWR1)) {
-               host_dawr1 = mfspr(SPRN_DAWR1);
-               host_dawrx1 = mfspr(SPRN_DAWRX1);
-       }
+       mtspr(SPRN_DSCR, vcpu->arch.dscr);
+       mtspr(SPRN_IAMR, vcpu->arch.iamr);
+       mtspr(SPRN_PSPB, vcpu->arch.pspb);
+       mtspr(SPRN_FSCR, vcpu->arch.fscr);
+       mtspr(SPRN_TAR, vcpu->arch.tar);
+       mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+       mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+       mtspr(SPRN_BESCR, vcpu->arch.bescr);
+       mtspr(SPRN_WORT, vcpu->arch.wort);
+       mtspr(SPRN_TIDR, vcpu->arch.tid);
+       mtspr(SPRN_AMR, vcpu->arch.amr);
+       mtspr(SPRN_UAMOR, vcpu->arch.uamor);
 
        /*
-        * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
-        * so set HDICE before writing HDEC.
+        * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI]
+        * clear (or hstate set appropriately to catch those registers
+        * being clobbered if we take an MCE or SRESET), so those are done
+        * later.
         */
-       mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
-       isync();
-
-       hdec = time_limit - mftb();
-       if (hdec < 0) {
-               mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
-               isync();
-               return BOOK3S_INTERRUPT_HV_DECREMENTER;
-       }
-       mtspr(SPRN_HDEC, hdec);
-
-       if (vc->tb_offset) {
-               u64 new_tb = mftb() + vc->tb_offset;
-               mtspr(SPRN_TBU40, new_tb);
-               tb = mftb();
-               if ((tb & 0xffffff) < (new_tb & 0xffffff))
-                       mtspr(SPRN_TBU40, new_tb + 0x1000000);
-               vc->tb_offset_applied = vc->tb_offset;
-       }
-
-       if (vc->pcr)
-               mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
-       mtspr(SPRN_DPDES, vc->dpdes);
-       mtspr(SPRN_VTB, vc->vtb);
-
-       local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
-       local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
-       mtspr(SPRN_PURR, vcpu->arch.purr);
-       mtspr(SPRN_SPURR, vcpu->arch.spurr);
-
-       if (dawr_enabled()) {
-               mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
-               mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
-               if (cpu_has_feature(CPU_FTR_DAWR1)) {
-                       mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
-                       mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
-               }
-       }
-       mtspr(SPRN_CIABR, vcpu->arch.ciabr);
-       mtspr(SPRN_IC, vcpu->arch.ic);
-       mtspr(SPRN_PID, vcpu->arch.pid);
 
-       mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
-             (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
-
-       mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
-
-       mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
-       mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
-       mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
-       mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
-
-       mtspr(SPRN_AMOR, ~0UL);
-
-       mtspr(SPRN_LPCR, lpcr);
-       isync();
-
-       kvmppc_xive_push_vcpu(vcpu);
-
-       mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
-       mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
-
-       trap = __kvmhv_vcpu_entry_p9(vcpu);
-
-       /* Advance host PURR/SPURR by the amount used by guest */
-       purr = mfspr(SPRN_PURR);
-       spurr = mfspr(SPRN_SPURR);
-       mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
-             purr - vcpu->arch.purr);
-       mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
-             spurr - vcpu->arch.spurr);
-       vcpu->arch.purr = purr;
-       vcpu->arch.spurr = spurr;
+       if (!(vcpu->arch.ctrl & 1))
+               mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+}
 
-       vcpu->arch.ic = mfspr(SPRN_IC);
-       vcpu->arch.pid = mfspr(SPRN_PID);
-       vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
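+/*
+ * Counterpart of load_spr_state() above, run on the way out of the guest
+ * before restore_p9_host_os_sprs(), so the guest SPR values end up in
+ * vcpu->arch.
+ */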
+static void store_spr_state(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
 
-       vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
-       vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
-       vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
-       vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+       vcpu->arch.iamr = mfspr(SPRN_IAMR);
+       vcpu->arch.pspb = mfspr(SPRN_PSPB);
+       vcpu->arch.fscr = mfspr(SPRN_FSCR);
+       vcpu->arch.tar = mfspr(SPRN_TAR);
+       vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+       vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+       vcpu->arch.bescr = mfspr(SPRN_BESCR);
+       vcpu->arch.wort = mfspr(SPRN_WORT);
+       vcpu->arch.tid = mfspr(SPRN_TIDR);
+       vcpu->arch.amr = mfspr(SPRN_AMR);
+       vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+       vcpu->arch.dscr = mfspr(SPRN_DSCR);
+}
 
-       /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
-       mtspr(SPRN_PSSCR, host_psscr |
-             (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
-       mtspr(SPRN_HFSCR, host_hfscr);
-       mtspr(SPRN_CIABR, host_ciabr);
-       mtspr(SPRN_DAWR0, host_dawr0);
-       mtspr(SPRN_DAWRX0, host_dawrx0);
-       if (cpu_has_feature(CPU_FTR_DAWR1)) {
-               mtspr(SPRN_DAWR1, host_dawr1);
-               mtspr(SPRN_DAWRX1, host_dawrx1);
-       }
-       mtspr(SPRN_PID, host_pidr);
+/*
+ * Privileged (non-hypervisor) host registers to save.
+ */
+struct p9_host_os_sprs {
+       unsigned long dscr;
+       unsigned long tidr;
+       unsigned long iamr;
+       unsigned long amr;
+       unsigned long fscr;
+};
 
-       /*
-        * Since this is radix, do a eieio; tlbsync; ptesync sequence in
-        * case we interrupted the guest between a tlbie and a ptesync.
-        */
-       asm volatile("eieio; tlbsync; ptesync");
+static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
+{
+       host_os_sprs->dscr = mfspr(SPRN_DSCR);
+       host_os_sprs->tidr = mfspr(SPRN_TIDR);
+       host_os_sprs->iamr = mfspr(SPRN_IAMR);
+       host_os_sprs->amr = mfspr(SPRN_AMR);
+       host_os_sprs->fscr = mfspr(SPRN_FSCR);
+}
 
-       /*
-        * cp_abort is required if the processor supports local copy-paste
-        * to clear the copy buffer that was under control of the guest.
-        */
-       if (cpu_has_feature(CPU_FTR_ARCH_31))
-               asm volatile(PPC_CP_ABORT);
+/* vcpu guest regs must already be saved */
+static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
+                                   struct p9_host_os_sprs *host_os_sprs)
+{
+       mtspr(SPRN_PSPB, 0);
+       mtspr(SPRN_WORT, 0);
+       mtspr(SPRN_UAMOR, 0);
 
-       mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);    /* restore host LPID */
-       isync();
+       mtspr(SPRN_DSCR, host_os_sprs->dscr);
+       mtspr(SPRN_TIDR, host_os_sprs->tidr);
+       mtspr(SPRN_IAMR, host_os_sprs->iamr);
 
-       vc->dpdes = mfspr(SPRN_DPDES);
-       vc->vtb = mfspr(SPRN_VTB);
-       mtspr(SPRN_DPDES, 0);
-       if (vc->pcr)
-               mtspr(SPRN_PCR, PCR_MASK);
+       if (host_os_sprs->amr != vcpu->arch.amr)
+               mtspr(SPRN_AMR, host_os_sprs->amr);
 
-       if (vc->tb_offset_applied) {
-               u64 new_tb = mftb() - vc->tb_offset_applied;
-               mtspr(SPRN_TBU40, new_tb);
-               tb = mftb();
-               if ((tb & 0xffffff) < (new_tb & 0xffffff))
-                       mtspr(SPRN_TBU40, new_tb + 0x1000000);
-               vc->tb_offset_applied = 0;
-       }
+       if (host_os_sprs->fscr != vcpu->arch.fscr)
+               mtspr(SPRN_FSCR, host_os_sprs->fscr);
 
-       mtspr(SPRN_HDEC, 0x7fffffff);
-       mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+       /* Guest CTRL was saved in store_spr_state(); set runlatch back to 1 */
+       if (!(vcpu->arch.ctrl & 1))
+               mtspr(SPRN_CTRLT, 1);
+}
 
-       return trap;
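+/*
+ * XICS hcalls that must be handled while the vcpu's XIVE context is still
+ * pushed; see the H_CEDE/XICS handling in kvmhv_p9_guest_entry() below.
+ */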
+static inline bool hcall_is_xics(unsigned long req)
+{
+       return req == H_EOI || req == H_CPPR || req == H_IPI ||
+               req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
 }
 
 /*
- * Virtual-mode guest entry for POWER9 and later when the host and
- * guest are both using the radix MMU.  The LPIDR has already been set.
+ * Guest entry for POWER9 and later CPUs.
  */
 static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                         unsigned long lpcr)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
-       unsigned long host_dscr = mfspr(SPRN_DSCR);
-       unsigned long host_tidr = mfspr(SPRN_TIDR);
-       unsigned long host_iamr = mfspr(SPRN_IAMR);
-       unsigned long host_amr = mfspr(SPRN_AMR);
-       unsigned long host_fscr = mfspr(SPRN_FSCR);
+       struct p9_host_os_sprs host_os_sprs;
        s64 dec;
        u64 tb;
        int trap, save_pmu;
 
+       WARN_ON_ONCE(vcpu->arch.ceded);
+
        dec = mfspr(SPRN_DEC);
        tb = mftb();
        if (dec < 0)
@@ -3664,7 +3830,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
        if (local_paca->kvm_hstate.dec_expires < time_limit)
                time_limit = local_paca->kvm_hstate.dec_expires;
 
-       vcpu->arch.ceded = 0;
+       save_p9_host_os_sprs(&host_os_sprs);
 
        kvmhv_save_host_pmu();          /* saves it to PACA kvm_hstate */
 
@@ -3693,24 +3859,20 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 #endif
        mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 
-       mtspr(SPRN_DSCR, vcpu->arch.dscr);
-       mtspr(SPRN_IAMR, vcpu->arch.iamr);
-       mtspr(SPRN_PSPB, vcpu->arch.pspb);
-       mtspr(SPRN_FSCR, vcpu->arch.fscr);
-       mtspr(SPRN_TAR, vcpu->arch.tar);
-       mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
-       mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
-       mtspr(SPRN_BESCR, vcpu->arch.bescr);
-       mtspr(SPRN_WORT, vcpu->arch.wort);
-       mtspr(SPRN_TIDR, vcpu->arch.tid);
-       mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
-       mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
-       mtspr(SPRN_AMR, vcpu->arch.amr);
-       mtspr(SPRN_UAMOR, vcpu->arch.uamor);
-
-       if (!(vcpu->arch.ctrl & 1))
-               mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+       load_spr_state(vcpu);
 
+       /*
+        * When setting DEC we must always deal with the race against
+        * irq_work_raise (via NMI): right as we switch into guest mode, if an
+        * NMI hits, sets pending work, and sets DEC, then that DEC will apply
+        * to the guest and not bring us back to the host.
+        *
+        * irq_work_raise could check a flag (or possibly LPCR[HDICE] for
+        * example) and set HDEC to 1? That wouldn't solve the nested HV
+        * case, which needs to abort the hcall or zero the time limit.
+        *
+        * XXX: Another day's problem.
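+        *
+        * (After the exit below, test_irq_work_pending() is checked again and
+        * DEC is forced to expire immediately if irq work was raised in the
+        * meantime; see the set_dec(1) near the end of this function.)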
+        */
        mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
 
        if (kvmhv_on_pseries()) {
@@ -3718,7 +3880,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                 * We need to save and restore the guest visible part of the
                 * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
                 * doesn't do this for us. Note only required if pseries since
-                * this is done in kvmhv_load_hv_regs_and_go() below otherwise.
+                * this is done in kvmhv_vcpu_entry_p9() below otherwise.
                 */
                unsigned long host_psscr;
                /* call our hypervisor to load up HV regs and go */
@@ -3738,6 +3900,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                        hvregs.vcpu_token = vcpu->vcpu_id;
                }
                hvregs.hdec_expiry = time_limit;
+               mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+               mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
                trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
                                          __pa(&vcpu->arch.regs));
                kvmhv_restore_hv_return_state(vcpu, &hvregs);
@@ -3750,15 +3914,41 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                /* H_CEDE has to be handled now, not later */
                if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
                    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
-                       kvmppc_nested_cede(vcpu);
+                       kvmppc_cede(vcpu);
                        kvmppc_set_gpr(vcpu, 3, 0);
                        trap = 0;
                }
        } else {
-               trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+               kvmppc_xive_push_vcpu(vcpu);
+               trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr);
+               if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+                   !(vcpu->arch.shregs.msr & MSR_PR)) {
+                       unsigned long req = kvmppc_get_gpr(vcpu, 3);
+
+                       /* H_CEDE has to be handled now, not later */
+                       if (req == H_CEDE) {
+                               kvmppc_cede(vcpu);
+                               kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
+                               kvmppc_set_gpr(vcpu, 3, 0);
+                               trap = 0;
+
+                       /* XICS hcalls must be handled before xive is pulled */
+                       } else if (hcall_is_xics(req)) {
+                               int ret;
+
+                               ret = kvmppc_xive_xics_hcall(vcpu, req);
+                               if (ret != H_TOO_HARD) {
+                                       kvmppc_set_gpr(vcpu, 3, ret);
+                                       trap = 0;
+                               }
+                       }
+               }
+               kvmppc_xive_pull_vcpu(vcpu);
+
+               if (kvm_is_radix(vcpu->kvm))
+                       vcpu->arch.slb_max = 0;
        }
 
-       vcpu->arch.slb_max = 0;
        dec = mfspr(SPRN_DEC);
        if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
                dec = (s32) dec;
@@ -3766,36 +3956,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
        vcpu->arch.dec_expires = dec + tb;
        vcpu->cpu = -1;
        vcpu->arch.thread_cpu = -1;
-       /* Save guest CTRL register, set runlatch to 1 */
-       vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
-       if (!(vcpu->arch.ctrl & 1))
-               mtspr(SPRN_CTRLT, vcpu->arch.ctrl | 1);
-
-       vcpu->arch.iamr = mfspr(SPRN_IAMR);
-       vcpu->arch.pspb = mfspr(SPRN_PSPB);
-       vcpu->arch.fscr = mfspr(SPRN_FSCR);
-       vcpu->arch.tar = mfspr(SPRN_TAR);
-       vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
-       vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
-       vcpu->arch.bescr = mfspr(SPRN_BESCR);
-       vcpu->arch.wort = mfspr(SPRN_WORT);
-       vcpu->arch.tid = mfspr(SPRN_TIDR);
-       vcpu->arch.amr = mfspr(SPRN_AMR);
-       vcpu->arch.uamor = mfspr(SPRN_UAMOR);
-       vcpu->arch.dscr = mfspr(SPRN_DSCR);
 
-       mtspr(SPRN_PSPB, 0);
-       mtspr(SPRN_WORT, 0);
-       mtspr(SPRN_UAMOR, 0);
-       mtspr(SPRN_DSCR, host_dscr);
-       mtspr(SPRN_TIDR, host_tidr);
-       mtspr(SPRN_IAMR, host_iamr);
+       store_spr_state(vcpu);
 
-       if (host_amr != vcpu->arch.amr)
-               mtspr(SPRN_AMR, host_amr);
-
-       if (host_fscr != vcpu->arch.fscr)
-               mtspr(SPRN_FSCR, host_fscr);
+       restore_p9_host_os_sprs(vcpu, &host_os_sprs);
 
        msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
        store_fp_state(&vcpu->arch.fp);
@@ -3825,6 +3989,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
        vc->in_guest = 0;
 
        mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+       /* We may have raced with new irq work */
+       if (test_irq_work_pending())
+               set_dec(1);
        mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
        kvmhv_load_host_pmu();
@@ -3925,7 +4092,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        cur = start_poll = ktime_get();
        if (vc->halt_poll_ns) {
                ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
-               ++vc->runner->stat.halt_attempted_poll;
+               ++vc->runner->stat.generic.halt_attempted_poll;
 
                vc->vcore_state = VCORE_POLLING;
                spin_unlock(&vc->lock);
@@ -3936,13 +4103,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
                                break;
                        }
                        cur = ktime_get();
-               } while (single_task_running() && ktime_before(cur, stop));
+               } while (kvm_vcpu_can_poll(cur, stop));
 
                spin_lock(&vc->lock);
                vc->vcore_state = VCORE_INACTIVE;
 
                if (!do_sleep) {
-                       ++vc->runner->stat.halt_successful_poll;
+                       ++vc->runner->stat.generic.halt_successful_poll;
                        goto out;
                }
        }
@@ -3954,7 +4121,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
                do_sleep = 0;
                /* If we polled, count this as a successful poll */
                if (vc->halt_poll_ns)
-                       ++vc->runner->stat.halt_successful_poll;
+                       ++vc->runner->stat.generic.halt_successful_poll;
                goto out;
        }
 
@@ -3981,13 +4148,13 @@ out:
                        ktime_to_ns(cur) - ktime_to_ns(start_wait);
                /* Attribute failed poll time */
                if (vc->halt_poll_ns)
-                       vc->runner->stat.halt_poll_fail_ns +=
+                       vc->runner->stat.generic.halt_poll_fail_ns +=
                                ktime_to_ns(start_wait) -
                                ktime_to_ns(start_poll);
        } else {
                /* Attribute successful poll time */
                if (vc->halt_poll_ns)
-                       vc->runner->stat.halt_poll_success_ns +=
+                       vc->runner->stat.generic.halt_poll_success_ns +=
                                ktime_to_ns(cur) -
                                ktime_to_ns(start_poll);
        }
@@ -4014,7 +4181,6 @@ out:
 /*
  * This never fails for a radix guest, as none of the operations it does
  * for a radix guest can fail or have a way to report failure.
- * kvmhv_run_single_vcpu() relies on this fact.
  */
 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
 {
@@ -4170,7 +4336,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 {
        struct kvm_run *run = vcpu->run;
        int trap, r, pcpu;
-       int srcu_idx, lpid;
+       int srcu_idx;
        struct kvmppc_vcore *vc;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -4193,8 +4359,15 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
        vc->runner = vcpu;
 
        /* See if the MMU is ready to go */
-       if (!kvm->arch.mmu_ready)
-               kvmhv_setup_mmu(vcpu);
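+       /*
+        * kvmhv_setup_mmu() can fail for a hash guest (for a radix guest it
+        * cannot, see the comment above that function), so report the failure
+        * as a failed entry instead of ignoring it.
+        */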
+       if (!kvm->arch.mmu_ready) {
+               r = kvmhv_setup_mmu(vcpu);
+               if (r) {
+                       run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+                       run->fail_entry.hardware_entry_failure_reason = 0;
+                       vcpu->arch.ret = r;
+                       return r;
+               }
+       }
 
        if (need_resched())
                cond_resched();
@@ -4207,7 +4380,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
        preempt_disable();
        pcpu = smp_processor_id();
        vc->pcpu = pcpu;
-       kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+       if (kvm_is_radix(kvm))
+               kvmppc_prepare_radix_vcpu(vcpu, pcpu);
 
        local_irq_disable();
        hard_irq_disable();
@@ -4244,13 +4418,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
        vc->vcore_state = VCORE_RUNNING;
        trace_kvmppc_run_core(vc, 0);
 
-       if (cpu_has_feature(CPU_FTR_HVMODE)) {
-               lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
-               mtspr(SPRN_LPID, lpid);
-               isync();
-               kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
-       }
-
        guest_enter_irqoff();
 
        srcu_idx = srcu_read_lock(&kvm->srcu);
@@ -4269,11 +4436,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 
        srcu_read_unlock(&kvm->srcu, srcu_idx);
 
-       if (cpu_has_feature(CPU_FTR_HVMODE)) {
-               mtspr(SPRN_LPID, kvm->arch.host_lpid);
-               isync();
-       }
-
        set_irq_happened(trap);
 
        kvmppc_set_host_core(pcpu);
@@ -4419,19 +4581,23 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
        do {
-               /*
-                * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
-                * path, which also handles hash and dependent threads mode.
-                */
-               if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
-                   !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+               if (cpu_has_feature(CPU_FTR_ARCH_300))
                        r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
                                                  vcpu->arch.vcore->lpcr);
                else
                        r = kvmppc_run_vcpu(vcpu);
 
-               if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
-                   !(vcpu->arch.shregs.msr & MSR_PR)) {
+               if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
+                       if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
+                               /*
+                                * These should have been caught and reflected
+                                * into the guest by now. Final sanity check:
+                                * don't allow userspace to execute hcalls in
+                                * the hypervisor.
+                                */
+                               r = RESUME_GUEST;
+                               continue;
+                       }
                        trace_kvm_hcall_enter(vcpu);
                        r = kvmppc_pseries_do_hcall(vcpu);
                        trace_kvm_hcall_exit(vcpu, r);
@@ -4455,7 +4621,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
                mtspr(SPRN_EBBRR, ebb_regs[1]);
                mtspr(SPRN_BESCR, ebb_regs[2]);
                mtspr(SPRN_TAR, user_tar);
-               mtspr(SPRN_FSCR, current->thread.fscr);
        }
        mtspr(SPRN_VRSAVE, user_vrsave);
 
@@ -5039,18 +5204,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
        /*
         * Track that we now have a HV mode VM active. This blocks secondary
         * CPU threads from coming online.
-        * On POWER9, we only need to do this if the "indep_threads_mode"
-        * module parameter has been set to N.
         */
-       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-               if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
-                       pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
-                       kvm->arch.threads_indep = true;
-               } else {
-                       kvm->arch.threads_indep = indep_threads_mode;
-               }
-       }
-       if (!kvm->arch.threads_indep)
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
                kvm_hv_vm_activated();
 
        /*
@@ -5091,7 +5246,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
        debugfs_remove_recursive(kvm->arch.debugfs_dir);
 
-       if (!kvm->arch.threads_indep)
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
                kvm_hv_vm_deactivated();
 
        kvmppc_free_vcores(kvm);
@@ -5512,7 +5667,9 @@ static int kvmhv_enable_nested(struct kvm *kvm)
 {
        if (!nested)
                return -EPERM;
-       if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+               return -ENODEV;
+       if (!radix_enabled())
                return -ENODEV;
 
        /* kvm == NULL means the caller is testing if the capability exists */
@@ -5675,11 +5832,25 @@ static int kvmhv_enable_dawr1(struct kvm *kvm)
 
 static bool kvmppc_hash_v3_possible(void)
 {
-       if (radix_enabled() && no_mixing_hpt_and_radix)
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+               return false;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
                return false;
 
-       return cpu_has_feature(CPU_FTR_ARCH_300) &&
-               cpu_has_feature(CPU_FTR_HVMODE);
+       /*
+        * POWER9 chips before version 2.02 can't have some threads in
+        * HPT mode and some in radix mode on the same core.
+        */
+       if (radix_enabled()) {
+               unsigned int pvr = mfspr(SPRN_PVR);
+               if ((pvr >> 16) == PVR_POWER9 &&
+                   (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
+                    ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
+                       return false;
+       }
+
+       return true;
 }
 
 static struct kvmppc_ops kvm_ops_hv = {
@@ -5823,18 +5994,6 @@ static int kvmppc_book3s_init_hv(void)
        if (kvmppc_radix_possible())
                r = kvmppc_radix_init();
 
-       /*
-        * POWER9 chips before version 2.02 can't have some threads in
-        * HPT mode and some in radix mode on the same core.
-        */
-       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-               unsigned int pvr = mfspr(SPRN_PVR);
-               if ((pvr >> 16) == PVR_POWER9 &&
-                   (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
-                    ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
-                       no_mixing_hpt_and_radix = true;
-       }
-
        r = kvmppc_uvmem_init();
        if (r < 0)
                pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);