Merge branch 'topic/ppc-kvm' into next
author Michael Ellerman <mpe@ellerman.id.au>
Thu, 19 May 2022 13:10:42 +0000 (23:10 +1000)
committer Michael Ellerman <mpe@ellerman.id.au>
Thu, 19 May 2022 13:10:42 +0000 (23:10 +1000)
Merge our KVM topic branch.

12 files changed:
arch/powerpc/kernel/iommu.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_nested.c
arch/powerpc/kvm/book3s_hv_p9_entry.c
arch/powerpc/kvm/book3s_hv_uvmem.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/e500mc.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/mm/init_64.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/pseries/iommu.c

@@@ -27,6 -27,7 +27,6 @@@
  #include <linux/sched.h>
  #include <linux/debugfs.h>
  #include <asm/io.h>
 -#include <asm/prom.h>
  #include <asm/iommu.h>
  #include <asm/pci-bridge.h>
  #include <asm/machdep.h>
@@@ -1064,7 -1065,7 +1064,7 @@@ extern long iommu_tce_xchg_no_kill(stru
        long ret;
        unsigned long size = 0;
  
-       ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false);
+       ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction);
        if (!ret && ((*direction == DMA_FROM_DEVICE) ||
                        (*direction == DMA_BIDIRECTIONAL)) &&
                        !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
@@@ -1079,7 -1080,7 +1079,7 @@@ void iommu_tce_kill(struct iommu_table 
                unsigned long entry, unsigned long pages)
  {
        if (tbl->it_ops->tce_kill)
-               tbl->it_ops->tce_kill(tbl, entry, pages, false);
+               tbl->it_ops->tce_kill(tbl, entry, pages);
  }
  EXPORT_SYMBOL_GPL(iommu_tce_kill);
  
@@@ -58,7 -58,7 +58,7 @@@ struct kvm_resize_hpt 
        /* Possible values and their usage:
         *  <0     an error occurred during allocation,
         *  -EBUSY allocation is in the progress,
 -       *  0      allocation made successfuly.
 +       *  0      allocation made successfully.
         */
        int error;
  
@@@ -256,26 -256,34 +256,34 @@@ void kvmppc_map_vrma(struct kvm_vcpu *v
  
  int kvmppc_mmu_hv_init(void)
  {
-       unsigned long host_lpid, rsvd_lpid;
+       unsigned long nr_lpids;
  
        if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
                return -EINVAL;
  
-       host_lpid = 0;
-       if (cpu_has_feature(CPU_FTR_HVMODE))
-               host_lpid = mfspr(SPRN_LPID);
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               if (WARN_ON(mfspr(SPRN_LPID) != 0))
+                       return -EINVAL;
+               nr_lpids = 1UL << mmu_lpid_bits;
+       } else {
+               nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT;
+       }
  
-       /* POWER8 and above have 12-bit LPIDs (10-bit in POWER7) */
-       if (cpu_has_feature(CPU_FTR_ARCH_207S))
-               rsvd_lpid = LPID_RSVD;
-       else
-               rsvd_lpid = LPID_RSVD_POWER7;
+       if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+               /* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */
+               if (cpu_has_feature(CPU_FTR_ARCH_207S))
+                       WARN_ON(nr_lpids != 1UL << 12);
+               else
+                       WARN_ON(nr_lpids != 1UL << 10);
  
-       kvmppc_init_lpid(rsvd_lpid + 1);
+               /*
+                * Reserve the last implemented LPID use in partition
+                * switching for POWER7 and POWER8.
+                */
+               nr_lpids -= 1;
+       }
  
-       kvmppc_claim_lpid(host_lpid);
-       /* rsvd_lpid is reserved for use in partition switching */
-       kvmppc_claim_lpid(rsvd_lpid);
+       kvmppc_init_lpid(nr_lpids);
  
        return 0;
  }
@@@ -879,7 -887,7 +887,7 @@@ static bool kvm_age_rmapp(struct kvm *k
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
        __be64 *hptep;
-       int ret = 0;
+       bool ret = false;
        unsigned long *rmapp;
  
        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_REFERENCED) {
                *rmapp &= ~KVMPPC_RMAP_REFERENCED;
-               ret = 1;
+               ret = true;
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
                                rev[i].guest_rpte |= HPTE_R_R;
                                note_hpte_modification(kvm, &rev[i]);
                        }
-                       ret = 1;
+                       ret = true;
                }
                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        } while ((i = j) != head);
@@@ -42,7 -42,6 +42,7 @@@
  #include <linux/module.h>
  #include <linux/compiler.h>
  #include <linux/of.h>
 +#include <linux/irqdomain.h>
  
  #include <asm/ftrace.h>
  #include <asm/reg.h>
@@@ -1327,6 -1326,12 +1327,12 @@@ static int kvmppc_hcall_impl_hv(unsigne
        case H_CONFER:
        case H_REGISTER_VPA:
        case H_SET_MODE:
+ #ifdef CONFIG_SPAPR_TCE_IOMMU
+       case H_GET_TCE:
+       case H_PUT_TCE:
+       case H_PUT_TCE_INDIRECT:
+       case H_STUFF_TCE:
+ #endif
        case H_LOGICAL_CI_LOAD:
        case H_LOGICAL_CI_STORE:
  #ifdef CONFIG_KVM_XICS
@@@ -2835,7 -2840,7 +2841,7 @@@ static int kvmppc_core_vcpu_create_hv(s
         * to trap and then we emulate them.
         */
        vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
-               HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP | HFSCR_PREFIX;
+               HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
        if (cpu_has_feature(CPU_FTR_HVMODE)) {
                vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@@ -3968,6 -3973,7 +3974,7 @@@ static int kvmhv_vcpu_entry_p9_nested(s
  
        kvmhv_save_hv_regs(vcpu, &hvregs);
        hvregs.lpcr = lpcr;
+       hvregs.amor = ~0;
        vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
        hvregs.version = HV_GUEST_STATE_VERSION;
        if (vcpu->arch.nested) {
  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                         unsigned long lpcr, u64 *tb)
  {
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
        u64 next_timer;
        int trap;
  
                trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
  
                /* H_CEDE has to be handled now, not later */
-               if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+               if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
                    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
                        kvmppc_cede(vcpu);
                        kvmppc_set_gpr(vcpu, 3, 0);
                        trap = 0;
                }
  
-       } else {
-               struct kvm *kvm = vcpu->kvm;
+       } else if (nested) {
+               __this_cpu_write(cpu_in_guest, kvm);
+               trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
+               __this_cpu_write(cpu_in_guest, NULL);
  
+       } else {
                kvmppc_xive_push_vcpu(vcpu);
  
                __this_cpu_write(cpu_in_guest, kvm);
                trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
                __this_cpu_write(cpu_in_guest, NULL);
  
-               if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+               if (trap == BOOK3S_INTERRUPT_SYSCALL &&
                    !(vcpu->arch.shregs.msr & MSR_PR)) {
                        unsigned long req = kvmppc_get_gpr(vcpu, 3);
  
-                       /* H_CEDE has to be handled now, not later */
+                       /*
+                        * XIVE rearm and XICS hcalls must be handled
+                        * before xive context is pulled (is this
+                        * true?)
+                        */
                        if (req == H_CEDE) {
+                               /* H_CEDE has to be handled now */
                                kvmppc_cede(vcpu);
-                               kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
+                               if (!kvmppc_xive_rearm_escalation(vcpu)) {
+                                       /*
+                                        * Pending escalation so abort
+                                        * the cede.
+                                        */
+                                       vcpu->arch.ceded = 0;
+                               }
                                kvmppc_set_gpr(vcpu, 3, 0);
                                trap = 0;
  
-                       /* XICS hcalls must be handled before xive is pulled */
+                       } else if (req == H_ENTER_NESTED) {
+                               /*
+                                * L2 should not run with the L1
+                                * context so rearm and pull it.
+                                */
+                               if (!kvmppc_xive_rearm_escalation(vcpu)) {
+                                       /*
+                                        * Pending escalation so abort
+                                        * H_ENTER_NESTED.
+                                        */
+                                       kvmppc_set_gpr(vcpu, 3, 0);
+                                       trap = 0;
+                               }
                        } else if (hcall_is_xics(req)) {
                                int ret;
  
@@@ -4234,13 -4269,13 +4270,13 @@@ static void kvmppc_vcore_blocked(struc
        start_wait = ktime_get();
  
        vc->vcore_state = VCORE_SLEEPING;
-       trace_kvmppc_vcore_blocked(vc, 0);
+       trace_kvmppc_vcore_blocked(vc->runner, 0);
        spin_unlock(&vc->lock);
        schedule();
        finish_rcuwait(&vc->wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
-       trace_kvmppc_vcore_blocked(vc, 1);
+       trace_kvmppc_vcore_blocked(vc->runner, 1);
        ++vc->runner->stat.halt_successful_wait;
  
        cur = ktime_get();
@@@ -4520,9 -4555,14 +4556,14 @@@ int kvmhv_run_single_vcpu(struct kvm_vc
  
        if (!nested) {
                kvmppc_core_prepare_to_enter(vcpu);
-               if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
-                            &vcpu->arch.pending_exceptions))
+               if (vcpu->arch.shregs.msr & MSR_EE) {
+                       if (xive_interrupt_pending(vcpu))
+                               kvmppc_inject_interrupt_hv(vcpu,
+                                               BOOK3S_INTERRUPT_EXTERNAL, 0);
+               } else if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+                            &vcpu->arch.pending_exceptions)) {
                        lpcr |= LPCR_MER;
+               }
        } else if (vcpu->arch.pending_exceptions ||
                   vcpu->arch.doorbell_request ||
                   xive_interrupt_pending(vcpu)) {
                        if (kvmppc_vcpu_check_block(vcpu))
                                break;
  
-                       trace_kvmppc_vcore_blocked(vc, 0);
+                       trace_kvmppc_vcore_blocked(vcpu, 0);
                        schedule();
-                       trace_kvmppc_vcore_blocked(vc, 1);
+                       trace_kvmppc_vcore_blocked(vcpu, 1);
                }
                finish_rcuwait(wait);
        }
@@@ -5284,6 -5324,10 +5325,10 @@@ static int kvmppc_core_init_vm_hv(struc
                kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
                lpcr &= LPCR_PECE | LPCR_LPES;
        } else {
+               /*
+                * The L2 LPES mode will be set by the L0 according to whether
+                * or not it needs to take external interrupts in HV mode.
+                */
                lpcr = 0;
        }
        lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
@@@ -261,8 -261,7 +261,7 @@@ static void load_l2_hv_regs(struct kvm_
        /*
         * Don't let L1 change LPCR bits for the L2 except these:
         */
-       mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
-               LPCR_LPES | LPCR_MER;
+       mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | LPCR_MER;
  
        /*
         * Additional filtering is required depending on hardware
@@@ -306,10 -305,10 +305,10 @@@ long kvmhv_enter_nested_guest(struct kv
        /* copy parameters in */
        hv_ptr = kvmppc_get_gpr(vcpu, 4);
        regs_ptr = kvmppc_get_gpr(vcpu, 5);
 -      vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +      kvm_vcpu_srcu_read_lock(vcpu);
        err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
                                              hv_ptr, regs_ptr);
 -      srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 +      kvm_vcpu_srcu_read_unlock(vcpu);
        if (err)
                return H_PARAMETER;
  
                byteswap_hv_regs(&l2_hv);
                byteswap_pt_regs(&l2_regs);
        }
 -      vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +      kvm_vcpu_srcu_read_lock(vcpu);
        err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
                                               hv_ptr, regs_ptr);
 -      srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 +      kvm_vcpu_srcu_read_unlock(vcpu);
        if (err)
                return H_AUTHORITY;
  
@@@ -439,10 -438,11 +438,11 @@@ long kvmhv_nested_init(void
        if (!radix_enabled())
                return -ENODEV;
  
-       /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
-       ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
-       if (ptb_order < 8)
-               ptb_order = 8;
+       /* Partition table entry is 1<<4 bytes in size, hence the 4. */
+       ptb_order = KVM_MAX_NESTED_GUESTS_SHIFT + 4;
+       /* Minimum partition table size is 1<<12 bytes */
+       if (ptb_order < 12)
+               ptb_order = 12;
        pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
                                       GFP_KERNEL);
        if (!pseries_partition_tb) {
                return -ENOMEM;
        }
  
-       ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+       ptcr = __pa(pseries_partition_tb) | (ptb_order - 12);
        rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
        if (rc != H_SUCCESS) {
                pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
@@@ -521,11 -521,6 +521,6 @@@ static void kvmhv_set_nested_ptbl(struc
        kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
  }
  
- void kvmhv_vm_nested_init(struct kvm *kvm)
- {
-       kvm->arch.max_nested_lpid = -1;
- }
  /*
   * Handle the H_SET_PARTITION_TABLE hcall.
   * r4 = guest real address of partition table + log_2(size) - 12
@@@ -539,16 -534,14 +534,14 @@@ long kvmhv_set_partition_table(struct k
        long ret = H_SUCCESS;
  
        srcu_idx = srcu_read_lock(&kvm->srcu);
-       /*
-        * Limit the partition table to 4096 entries (because that's what
-        * hardware supports), and check the base address.
-        */
-       if ((ptcr & PRTS_MASK) > 12 - 8 ||
+       /* Check partition size and base address. */
+       if ((ptcr & PRTS_MASK) + 12 - 4 > KVM_MAX_NESTED_GUESTS_SHIFT ||
            !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
                ret = H_PARAMETER;
        srcu_read_unlock(&kvm->srcu, srcu_idx);
        if (ret == H_SUCCESS)
                kvm->arch.l1_ptcr = ptcr;
        return ret;
  }
  
@@@ -600,16 -593,16 +593,16 @@@ long kvmhv_copy_tofrom_guest_nested(str
                        goto not_found;
  
                /* Write what was loaded into our buffer back to the L1 guest */
 -              vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +              kvm_vcpu_srcu_read_lock(vcpu);
                rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n);
 -              srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 +              kvm_vcpu_srcu_read_unlock(vcpu);
                if (rc)
                        goto not_found;
        } else {
                /* Load the data to be stored from the L1 guest into our buf */
 -              vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +              kvm_vcpu_srcu_read_lock(vcpu);
                rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n);
 -              srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 +              kvm_vcpu_srcu_read_unlock(vcpu);
                if (rc)
                        goto not_found;
  
@@@ -644,7 -637,7 +637,7 @@@ static void kvmhv_update_ptbl_cache(str
  
        ret = -EFAULT;
        ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
-       if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8))) {
+       if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) {
                int srcu_idx = srcu_read_lock(&kvm->srcu);
                ret = kvm_read_guest(kvm, ptbl_addr,
                                     &ptbl_entry, sizeof(ptbl_entry));
        kvmhv_set_nested_ptbl(gp);
  }
  
+ void kvmhv_vm_nested_init(struct kvm *kvm)
+ {
+       idr_init(&kvm->arch.kvm_nested_guest_idr);
+ }
+ static struct kvm_nested_guest *__find_nested(struct kvm *kvm, int lpid)
+ {
+       return idr_find(&kvm->arch.kvm_nested_guest_idr, lpid);
+ }
+ static bool __prealloc_nested(struct kvm *kvm, int lpid)
+ {
+       if (idr_alloc(&kvm->arch.kvm_nested_guest_idr,
+                               NULL, lpid, lpid + 1, GFP_KERNEL) != lpid)
+               return false;
+       return true;
+ }
+ static void __add_nested(struct kvm *kvm, int lpid, struct kvm_nested_guest *gp)
+ {
+       if (idr_replace(&kvm->arch.kvm_nested_guest_idr, gp, lpid))
+               WARN_ON(1);
+ }
+ static void __remove_nested(struct kvm *kvm, int lpid)
+ {
+       idr_remove(&kvm->arch.kvm_nested_guest_idr, lpid);
+ }
  static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
  {
        struct kvm_nested_guest *gp;
@@@ -720,13 -742,8 +742,8 @@@ static void kvmhv_remove_nested(struct 
        long ref;
  
        spin_lock(&kvm->mmu_lock);
-       if (gp == kvm->arch.nested_guests[lpid]) {
-               kvm->arch.nested_guests[lpid] = NULL;
-               if (lpid == kvm->arch.max_nested_lpid) {
-                       while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
-                               ;
-                       kvm->arch.max_nested_lpid = lpid;
-               }
+       if (gp == __find_nested(kvm, lpid)) {
+               __remove_nested(kvm, lpid);
                --gp->refcnt;
        }
        ref = gp->refcnt;
   */
  void kvmhv_release_all_nested(struct kvm *kvm)
  {
-       int i;
+       int lpid;
        struct kvm_nested_guest *gp;
        struct kvm_nested_guest *freelist = NULL;
        struct kvm_memory_slot *memslot;
        int srcu_idx, bkt;
  
        spin_lock(&kvm->mmu_lock);
-       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
-               gp = kvm->arch.nested_guests[i];
-               if (!gp)
-                       continue;
-               kvm->arch.nested_guests[i] = NULL;
+       idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) {
+               __remove_nested(kvm, lpid);
                if (--gp->refcnt == 0) {
                        gp->next = freelist;
                        freelist = gp;
                }
        }
-       kvm->arch.max_nested_lpid = -1;
+       idr_destroy(&kvm->arch.kvm_nested_guest_idr);
+       /* idr is empty and may be reused at this point */
        spin_unlock(&kvm->mmu_lock);
        while ((gp = freelist) != NULL) {
                freelist = gp->next;
@@@ -792,12 -807,11 +807,11 @@@ struct kvm_nested_guest *kvmhv_get_nest
  {
        struct kvm_nested_guest *gp, *newgp;
  
-       if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
-           l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+       if (l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
                return NULL;
  
        spin_lock(&kvm->mmu_lock);
-       gp = kvm->arch.nested_guests[l1_lpid];
+       gp = __find_nested(kvm, l1_lpid);
        if (gp)
                ++gp->refcnt;
        spin_unlock(&kvm->mmu_lock);
        newgp = kvmhv_alloc_nested(kvm, l1_lpid);
        if (!newgp)
                return NULL;
+       if (!__prealloc_nested(kvm, l1_lpid)) {
+               kvmhv_release_nested(newgp);
+               return NULL;
+       }
        spin_lock(&kvm->mmu_lock);
-       if (kvm->arch.nested_guests[l1_lpid]) {
-               /* someone else beat us to it */
-               gp = kvm->arch.nested_guests[l1_lpid];
-       } else {
-               kvm->arch.nested_guests[l1_lpid] = newgp;
+       gp = __find_nested(kvm, l1_lpid);
+       if (!gp) {
+               __add_nested(kvm, l1_lpid, newgp);
                ++newgp->refcnt;
                gp = newgp;
                newgp = NULL;
-               if (l1_lpid > kvm->arch.max_nested_lpid)
-                       kvm->arch.max_nested_lpid = l1_lpid;
        }
        ++gp->refcnt;
        spin_unlock(&kvm->mmu_lock);
@@@ -841,20 -857,13 +857,13 @@@ void kvmhv_put_nested(struct kvm_nested
                kvmhv_release_nested(gp);
  }
  
- static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
- {
-       if (lpid > kvm->arch.max_nested_lpid)
-               return NULL;
-       return kvm->arch.nested_guests[lpid];
- }
  pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
                                 unsigned long ea, unsigned *hshift)
  {
        struct kvm_nested_guest *gp;
        pte_t *pte;
  
-       gp = kvmhv_find_nested(kvm, lpid);
+       gp = __find_nested(kvm, lpid);
        if (!gp)
                return NULL;
  
@@@ -960,7 -969,7 +969,7 @@@ static void kvmhv_remove_nest_rmap(stru
  
        gpa = n_rmap & RMAP_NESTED_GPA_MASK;
        lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
-       gp = kvmhv_find_nested(kvm, lpid);
+       gp = __find_nested(kvm, lpid);
        if (!gp)
                return;
  
@@@ -1152,16 -1161,13 +1161,13 @@@ static void kvmhv_emulate_tlbie_all_lpi
  {
        struct kvm *kvm = vcpu->kvm;
        struct kvm_nested_guest *gp;
-       int i;
+       int lpid;
  
        spin_lock(&kvm->mmu_lock);
-       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
-               gp = kvm->arch.nested_guests[i];
-               if (gp) {
-                       spin_unlock(&kvm->mmu_lock);
-                       kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
-                       spin_lock(&kvm->mmu_lock);
-               }
+       idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) {
+               spin_unlock(&kvm->mmu_lock);
+               kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+               spin_lock(&kvm->mmu_lock);
        }
        spin_unlock(&kvm->mmu_lock);
  }
@@@ -1313,7 -1319,7 +1319,7 @@@ long do_h_rpt_invalidate_pat(struct kvm
         * H_ENTER_NESTED call. Since we can't differentiate this case from
         * the invalid case, we ignore such flush requests and return success.
         */
-       if (!kvmhv_find_nested(vcpu->kvm, lpid))
+       if (!__find_nested(vcpu->kvm, lpid))
                return H_SUCCESS;
  
        /*
@@@ -1657,15 -1663,12 +1663,12 @@@ long int kvmhv_nested_page_fault(struc
  
  int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
  {
-       int ret = -1;
+       int ret = lpid + 1;
  
        spin_lock(&kvm->mmu_lock);
-       while (++lpid <= kvm->arch.max_nested_lpid) {
-               if (kvm->arch.nested_guests[lpid]) {
-                       ret = lpid;
-                       break;
-               }
-       }
+       if (!idr_get_next(&kvm->arch.kvm_nested_guest_idr, &ret))
+               ret = -1;
        spin_unlock(&kvm->mmu_lock);
        return ret;
  }
@@@ -379,7 -379,7 +379,7 @@@ void restore_p9_host_os_sprs(struct kvm
  {
        /*
         * current->thread.xxx registers must all be restored to host
 -       * values before a potential context switch, othrewise the context
 +       * values before a potential context switch, otherwise the context
         * switch itself will overwrite current->thread.xxx with the values
         * from the guest SPRs.
         */
@@@ -539,8 -539,10 +539,10 @@@ static void switch_mmu_to_guest_radix(s
  {
        struct kvm_nested_guest *nested = vcpu->arch.nested;
        u32 lpid;
+       u32 pid;
  
        lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
+       pid = vcpu->arch.pid;
  
        /*
         * Prior memory accesses to host PID Q3 must be completed before we
        isync();
        mtspr(SPRN_LPID, lpid);
        mtspr(SPRN_LPCR, lpcr);
-       mtspr(SPRN_PID, vcpu->arch.pid);
+       mtspr(SPRN_PID, pid);
        /*
         * isync not required here because we are HRFID'ing to guest before
         * any guest context access, which is context synchronising.
  static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
  {
        u32 lpid;
+       u32 pid;
        int i;
  
        lpid = kvm->arch.lpid;
+       pid = vcpu->arch.pid;
  
        /*
         * See switch_mmu_to_guest_radix. ptesync should not be required here
        isync();
        mtspr(SPRN_LPID, lpid);
        mtspr(SPRN_LPCR, lpcr);
-       mtspr(SPRN_PID, vcpu->arch.pid);
+       mtspr(SPRN_PID, pid);
  
        for (i = 0; i < vcpu->arch.slb_max; i++)
                mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv);
  
  static void switch_mmu_to_host(struct kvm *kvm, u32 pid)
  {
+       u32 lpid = kvm->arch.host_lpid;
+       u64 lpcr = kvm->arch.host_lpcr;
        /*
         * The guest has exited, so guest MMU context is no longer being
         * non-speculatively accessed, but a hwsync is needed before the
        asm volatile("hwsync" ::: "memory");
        isync();
        mtspr(SPRN_PID, pid);
-       mtspr(SPRN_LPID, kvm->arch.host_lpid);
-       mtspr(SPRN_LPCR, kvm->arch.host_lpcr);
+       mtspr(SPRN_LPID, lpid);
+       mtspr(SPRN_LPCR, lpcr);
        /*
         * isync is not required after the switch, because mtmsrd with L=0
         * is performed after this switch, which is context synchronising.
@@@ -120,7 -120,7 +120,7 @@@ static DEFINE_SPINLOCK(kvmppc_uvmem_bit
   *    content is un-encrypted.
   *
   * (c) Normal - The GFN is a normal. The GFN is associated with
 - *    a normal VM. The contents of the GFN is accesible to
 + *    a normal VM. The contents of the GFN is accessible to
   *    the Hypervisor. Its content is never encrypted.
   *
   * States of a VM.
@@@ -361,13 -361,15 +361,15 @@@ static bool kvmppc_gfn_is_uvmem_pfn(uns
  static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
                struct kvm *kvm, unsigned long *gfn)
  {
-       struct kvmppc_uvmem_slot *p;
+       struct kvmppc_uvmem_slot *p = NULL, *iter;
        bool ret = false;
        unsigned long i;
  
-       list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
-               if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+       list_for_each_entry(iter, &kvm->arch.uvmem_pfns, list)
+               if (*gfn >= iter->base_pfn && *gfn < iter->base_pfn + iter->nr_pfns) {
+                       p = iter;
                        break;
+               }
        if (!p)
                return ret;
        /*
  
  #include "book3s_xive.h"
  
- /*
-  * Virtual mode variants of the hcalls for use on radix/radix
-  * with AIL. They require the VCPU's VP to be "pushed"
-  *
-  * We still instantiate them here because we use some of the
-  * generated utility functions as well in this file.
-  */
- #define XIVE_RUNTIME_CHECKS
- #define X_PFX xive_vm_
- #define X_STATIC static
- #define X_STAT_PFX stat_vm_
- #define __x_tima              xive_tima
  #define __x_eoi_page(xd)      ((void __iomem *)((xd)->eoi_mmio))
  #define __x_trig_page(xd)     ((void __iomem *)((xd)->trig_mmio))
- #define __x_writeb    __raw_writeb
- #define __x_readw     __raw_readw
- #define __x_readq     __raw_readq
- #define __x_writeq    __raw_writeq
  
- #include "book3s_xive_template.c"
+ /* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */
+ #define XICS_DUMMY    1
+ static void xive_vm_ack_pending(struct kvmppc_xive_vcpu *xc)
+ {
+       u8 cppr;
+       u16 ack;
+       /*
+        * Ensure any previous store to CPPR is ordered vs.
+        * the subsequent loads from PIPR or ACK.
+        */
+       eieio();
+       /* Perform the acknowledge OS to register cycle. */
+       ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG));
+       /* Synchronize subsequent queue accesses */
+       mb();
+       /* XXX Check grouping level */
+       /* Anything ? */
+       if (!((ack >> 8) & TM_QW1_NSR_EO))
+               return;
+       /* Grab CPPR of the most favored pending interrupt */
+       cppr = ack & 0xff;
+       if (cppr < 8)
+               xc->pending |= 1 << cppr;
+       /* Check consistency */
+       if (cppr >= xc->hw_cppr)
+               pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
+                       smp_processor_id(), cppr, xc->hw_cppr);
+       /*
+        * Update our image of the HW CPPR. We don't yet modify
+        * xc->cppr, this will be done as we scan for interrupts
+        * in the queues.
+        */
+       xc->hw_cppr = cppr;
+ }
+ static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
+ {
+       u64 val;
+       if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+               offset |= XIVE_ESB_LD_ST_MO;
+       val = __raw_readq(__x_eoi_page(xd) + offset);
+ #ifdef __LITTLE_ENDIAN__
+       val >>= 64-8;
+ #endif
+       return (u8)val;
+ }
+ static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
+ {
+       /* If the XIVE supports the new "store EOI facility, use it */
+       if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+               __raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI);
+       else if (xd->flags & XIVE_IRQ_FLAG_LSI) {
+               /*
+                * For LSIs the HW EOI cycle is used rather than PQ bits,
+                * as they are automatically re-triggred in HW when still
+                * pending.
+                */
+               __raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
+       } else {
+               uint64_t eoi_val;
+               /*
+                * Otherwise for EOI, we use the special MMIO that does
+                * a clear of both P and Q and returns the old Q,
+                * except for LSIs where we use the "EOI cycle" special
+                * load.
+                *
+                * This allows us to then do a re-trigger if Q was set
+                * rather than synthetizing an interrupt in software
+                */
+               eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00);
+               /* Re-trigger if needed */
+               if ((eoi_val & 1) && __x_trig_page(xd))
+                       __raw_writeq(0, __x_trig_page(xd));
+       }
+ }
+ enum {
+       scan_fetch,
+       scan_poll,
+       scan_eoi,
+ };
+ static u32 xive_vm_scan_interrupts(struct kvmppc_xive_vcpu *xc,
+                                      u8 pending, int scan_type)
+ {
+       u32 hirq = 0;
+       u8 prio = 0xff;
+       /* Find highest pending priority */
+       while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
+               struct xive_q *q;
+               u32 idx, toggle;
+               __be32 *qpage;
+               /*
+                * If pending is 0 this will return 0xff which is what
+                * we want
+                */
+               prio = ffs(pending) - 1;
+               /* Don't scan past the guest cppr */
+               if (prio >= xc->cppr || prio > 7) {
+                       if (xc->mfrr < xc->cppr) {
+                               prio = xc->mfrr;
+                               hirq = XICS_IPI;
+                       }
+                       break;
+               }
+               /* Grab queue and pointers */
+               q = &xc->queues[prio];
+               idx = q->idx;
+               toggle = q->toggle;
+               /*
+                * Snapshot the queue page. The test further down for EOI
+                * must use the same "copy" that was used by __xive_read_eq
+                * since qpage can be set concurrently and we don't want
+                * to miss an EOI.
+                */
+               qpage = READ_ONCE(q->qpage);
+ skip_ipi:
+               /*
+                * Try to fetch from the queue. Will return 0 for a
+                * non-queueing priority (ie, qpage = 0).
+                */
+               hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
+               /*
+                * If this was a signal for an MFFR change done by
+                * H_IPI we skip it. Additionally, if we were fetching
+                * we EOI it now, thus re-enabling reception of a new
+                * such signal.
+                *
+                * We also need to do that if prio is 0 and we had no
+                * page for the queue. In this case, we have non-queued
+                * IPI that needs to be EOId.
+                *
+                * This is safe because if we have another pending MFRR
+                * change that wasn't observed above, the Q bit will have
+                * been set and another occurrence of the IPI will trigger.
+                */
+               if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
+                       if (scan_type == scan_fetch) {
+                               xive_vm_source_eoi(xc->vp_ipi,
+                                                      &xc->vp_ipi_data);
+                               q->idx = idx;
+                               q->toggle = toggle;
+                       }
+                       /* Loop back on same queue with updated idx/toggle */
+                       WARN_ON(hirq && hirq != XICS_IPI);
+                       if (hirq)
+                               goto skip_ipi;
+               }
+               /* If it's the dummy interrupt, continue searching */
+               if (hirq == XICS_DUMMY)
+                       goto skip_ipi;
+               /* Clear the pending bit if the queue is now empty */
+               if (!hirq) {
+                       pending &= ~(1 << prio);
+                       /*
+                        * Check if the queue count needs adjusting due to
+                        * interrupts being moved away.
+                        */
+                       if (atomic_read(&q->pending_count)) {
+                               int p = atomic_xchg(&q->pending_count, 0);
+                               if (p) {
+                                       WARN_ON(p > atomic_read(&q->count));
+                                       atomic_sub(p, &q->count);
+                               }
+                       }
+               }
+               /*
+                * If the most favoured prio we found pending is less
+                * favored (or equal) than a pending IPI, we return
+                * the IPI instead.
+                */
+               if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
+                       prio = xc->mfrr;
+                       hirq = XICS_IPI;
+                       break;
+               }
+               /* If fetching, update queue pointers */
+               if (scan_type == scan_fetch) {
+                       q->idx = idx;
+                       q->toggle = toggle;
+               }
+       }
+       /* If we are just taking a "peek", do nothing else */
+       if (scan_type == scan_poll)
+               return hirq;
+       /* Update the pending bits */
+       xc->pending = pending;
+       /*
+        * If this is an EOI that's it, no CPPR adjustment done here,
+        * all we needed was cleanup the stale pending bits and check
+        * if there's anything left.
+        */
+       if (scan_type == scan_eoi)
+               return hirq;
+       /*
+        * If we found an interrupt, adjust what the guest CPPR should
+        * be as if we had just fetched that interrupt from HW.
+        *
+        * Note: This can only make xc->cppr smaller as the previous
+        * loop will only exit with hirq != 0 if prio is lower than
+        * the current xc->cppr. Thus we don't need to re-check xc->mfrr
+        * for pending IPIs.
+        */
+       if (hirq)
+               xc->cppr = prio;
+       /*
+        * If it was an IPI the HW CPPR might have been lowered too much
+        * as the HW interrupt we use for IPIs is routed to priority 0.
+        *
+        * We re-sync it here.
+        */
+       if (xc->cppr != xc->hw_cppr) {
+               xc->hw_cppr = xc->cppr;
+               __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
+       }
+       return hirq;
+ }
+ static unsigned long xive_vm_h_xirr(struct kvm_vcpu *vcpu)
+ {
+       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+       u8 old_cppr;
+       u32 hirq;
+       pr_devel("H_XIRR\n");
+       xc->stat_vm_h_xirr++;
+       /* First collect pending bits from HW */
+       xive_vm_ack_pending(xc);
+       pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
+                xc->pending, xc->hw_cppr, xc->cppr);
+       /* Grab previous CPPR and reverse map it */
+       old_cppr = xive_prio_to_guest(xc->cppr);
+       /* Scan for actual interrupts */
+       hirq = xive_vm_scan_interrupts(xc, xc->pending, scan_fetch);
+       pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
+                hirq, xc->hw_cppr, xc->cppr);
+       /* That should never hit */
+       if (hirq & 0xff000000)
+               pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
+       /*
+        * XXX We could check if the interrupt is masked here and
+        * filter it. If we chose to do so, we would need to do:
+        *
+        *    if (masked) {
+        *        lock();
+        *        if (masked) {
+        *            old_Q = true;
+        *            hirq = 0;
+        *        }
+        *        unlock();
+        *    }
+        */
+       /* Return interrupt and old CPPR in GPR4 */
+       vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24);
+       return H_SUCCESS;
+ }
+ static unsigned long xive_vm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
+ {
+       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+       u8 pending = xc->pending;
+       u32 hirq;
+       pr_devel("H_IPOLL(server=%ld)\n", server);
+       xc->stat_vm_h_ipoll++;
+       /* Grab the target VCPU if not the current one */
+       if (xc->server_num != server) {
+               vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
+               if (!vcpu)
+                       return H_PARAMETER;
+               xc = vcpu->arch.xive_vcpu;
+               /* Scan all priorities */
+               pending = 0xff;
+       } else {
+               /* Grab pending interrupt if any */
+               __be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS);
+               u8 pipr = be64_to_cpu(qw1) & 0xff;
+               if (pipr < 8)
+                       pending |= 1 << pipr;
+       }
+       hirq = xive_vm_scan_interrupts(xc, pending, scan_poll);
+       /* Return interrupt and old CPPR in GPR4 */
+       vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24);
+       return H_SUCCESS;
+ }
+ static void xive_vm_push_pending_to_hw(struct kvmppc_xive_vcpu *xc)
+ {
+       u8 pending, prio;
+       pending = xc->pending;
+       if (xc->mfrr != 0xff) {
+               if (xc->mfrr < 8)
+                       pending |= 1 << xc->mfrr;
+               else
+                       pending |= 0x80;
+       }
+       if (!pending)
+               return;
+       prio = ffs(pending) - 1;
+       __raw_writeb(prio, xive_tima + TM_SPC_SET_OS_PENDING);
+ }
+ static void xive_vm_scan_for_rerouted_irqs(struct kvmppc_xive *xive,
+                                              struct kvmppc_xive_vcpu *xc)
+ {
+       unsigned int prio;
+       /* For each priority that is now masked */
+       for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+               struct xive_q *q = &xc->queues[prio];
+               struct kvmppc_xive_irq_state *state;
+               struct kvmppc_xive_src_block *sb;
+               u32 idx, toggle, entry, irq, hw_num;
+               struct xive_irq_data *xd;
+               __be32 *qpage;
+               u16 src;
+               idx = q->idx;
+               toggle = q->toggle;
+               qpage = READ_ONCE(q->qpage);
+               if (!qpage)
+                       continue;
+               /* For each interrupt in the queue */
+               for (;;) {
+                       entry = be32_to_cpup(qpage + idx);
+                       /* No more ? */
+                       if ((entry >> 31) == toggle)
+                               break;
+                       irq = entry & 0x7fffffff;
+                       /* Skip dummies and IPIs */
+                       if (irq == XICS_DUMMY || irq == XICS_IPI)
+                               goto next;
+                       sb = kvmppc_xive_find_source(xive, irq, &src);
+                       if (!sb)
+                               goto next;
+                       state = &sb->irq_state[src];
+                       /* Has it been rerouted ? */
+                       if (xc->server_num == state->act_server)
+                               goto next;
+                       /*
+                        * Allright, it *has* been re-routed, kill it from
+                        * the queue.
+                        */
+                       qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY);
+                       /* Find the HW interrupt */
+                       kvmppc_xive_select_irq(state, &hw_num, &xd);
+                       /* If it's not an LSI, set PQ to 11 the EOI will force a resend */
+                       if (!(xd->flags & XIVE_IRQ_FLAG_LSI))
+                               xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
+                       /* EOI the source */
+                       xive_vm_source_eoi(hw_num, xd);
+ next:
+                       idx = (idx + 1) & q->msk;
+                       if (idx == 0)
+                               toggle ^= 1;
+               }
+       }
+ }
+ static int xive_vm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+ {
+       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+       struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+       u8 old_cppr;
+       pr_devel("H_CPPR(cppr=%ld)\n", cppr);
+       xc->stat_vm_h_cppr++;
+       /* Map CPPR */
+       cppr = xive_prio_from_guest(cppr);
+       /* Remember old and update SW state */
+       old_cppr = xc->cppr;
+       xc->cppr = cppr;
+       /*
+        * Order the above update of xc->cppr with the subsequent
+        * read of xc->mfrr inside push_pending_to_hw()
+        */
+       smp_mb();
+       if (cppr > old_cppr) {
+               /*
+                * We are masking less, we need to look for pending things
+                * to deliver and set VP pending bits accordingly to trigger
+                * a new interrupt otherwise we might miss MFRR changes for
+                * which we have optimized out sending an IPI signal.
+                */
+               xive_vm_push_pending_to_hw(xc);
+       } else {
+               /*
+                * We are masking more, we need to check the queue for any
+                * interrupt that has been routed to another CPU, take
+                * it out (replace it with the dummy) and retrigger it.
+                *
+                * This is necessary since those interrupts may otherwise
+                * never be processed, at least not until this CPU restores
+                * its CPPR.
+                *
+                * This is in theory racy vs. HW adding new interrupts to
+                * the queue. In practice this works because the interesting
+                * cases are when the guest has done a set_xive() to move the
+                * interrupt away, which flushes the xive, followed by the
+                * target CPU doing a H_CPPR. So any new interrupt coming into
+                * the queue must still be routed to us and isn't a source
+                * of concern.
+                */
+               xive_vm_scan_for_rerouted_irqs(xive, xc);
+       }
+       /* Apply new CPPR */
+       xc->hw_cppr = cppr;
+       __raw_writeb(cppr, xive_tima + TM_QW1_OS + TM_CPPR);
+       return H_SUCCESS;
+ }
+ static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+ {
+       struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+       struct kvmppc_xive_src_block *sb;
+       struct kvmppc_xive_irq_state *state;
+       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+       struct xive_irq_data *xd;
+       u8 new_cppr = xirr >> 24;
+       u32 irq = xirr & 0x00ffffff, hw_num;
+       u16 src;
+       int rc = 0;
+       pr_devel("H_EOI(xirr=%08lx)\n", xirr);
+       xc->stat_vm_h_eoi++;
+       xc->cppr = xive_prio_from_guest(new_cppr);
+       /*
+        * IPIs are synthetized from MFRR and thus don't need
+        * any special EOI handling. The underlying interrupt
+        * used to signal MFRR changes is EOId when fetched from
+        * the queue.
+        */
+       if (irq == XICS_IPI || irq == 0) {
+               /*
+                * This barrier orders the setting of xc->cppr vs.
+                * subsquent test of xc->mfrr done inside
+                * scan_interrupts and push_pending_to_hw
+                */
+               smp_mb();
+               goto bail;
+       }
+       /* Find interrupt source */
+       sb = kvmppc_xive_find_source(xive, irq, &src);
+       if (!sb) {
+               pr_devel(" source not found !\n");
+               rc = H_PARAMETER;
+               /* Same as above */
+               smp_mb();
+               goto bail;
+       }
+       state = &sb->irq_state[src];
+       kvmppc_xive_select_irq(state, &hw_num, &xd);
+       state->in_eoi = true;
+       /*
+        * This barrier orders both setting of in_eoi above vs,
+        * subsequent test of guest_priority, and the setting
+        * of xc->cppr vs. subsquent test of xc->mfrr done inside
+        * scan_interrupts and push_pending_to_hw
+        */
+       smp_mb();
+ again:
+       if (state->guest_priority == MASKED) {
+               arch_spin_lock(&sb->lock);
+               if (state->guest_priority != MASKED) {
+                       arch_spin_unlock(&sb->lock);
+                       goto again;
+               }
+               pr_devel(" EOI on saved P...\n");
+               /* Clear old_p, that will cause unmask to perform an EOI */
+               state->old_p = false;
+               arch_spin_unlock(&sb->lock);
+       } else {
+               pr_devel(" EOI on source...\n");
+               /* Perform EOI on the source */
+               xive_vm_source_eoi(hw_num, xd);
+               /* If it's an emulated LSI, check level and resend */
+               if (state->lsi && state->asserted)
+                       __raw_writeq(0, __x_trig_page(xd));
+       }
+       /*
+        * This barrier orders the above guest_priority check
+        * and spin_lock/unlock with clearing in_eoi below.
+        *
+        * It also has to be a full mb() as it must ensure
+        * the MMIOs done in source_eoi() are completed before
+        * state->in_eoi is visible.
+        */
+       mb();
+       state->in_eoi = false;
+ bail:
+       /* Re-evaluate pending IRQs and update HW */
+       xive_vm_scan_interrupts(xc, xc->pending, scan_eoi);
+       xive_vm_push_pending_to_hw(xc);
+       pr_devel(" after scan pending=%02x\n", xc->pending);
+       /* Apply new CPPR */
+       xc->hw_cppr = xc->cppr;
+       __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
+       return rc;
+ }
+ static int xive_vm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+                              unsigned long mfrr)
+ {
+       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+       pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
+       xc->stat_vm_h_ipi++;
+       /* Find target */
+       vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
+       if (!vcpu)
+               return H_PARAMETER;
+       xc = vcpu->arch.xive_vcpu;
+       /* Locklessly write over MFRR */
+       xc->mfrr = mfrr;
+       /*
+        * The load of xc->cppr below and the subsequent MMIO store
+        * to the IPI must happen after the above mfrr update is
+        * globally visible so that:
+        *
+        * - Synchronize with another CPU doing an H_EOI or a H_CPPR
+        *   updating xc->cppr then reading xc->mfrr.
+        *
+        * - The target of the IPI sees the xc->mfrr update
+        */
+       mb();
+       /* Shoot the IPI if most favored than target cppr */
+       if (mfrr < xc->cppr)
+               __raw_writeq(0, __x_trig_page(&xc->vp_ipi_data));
+       return H_SUCCESS;
+ }
  
  /*
   * We leave a gap of a couple of interrupts in the queue to
@@@ -124,7 -726,7 +726,7 @@@ void kvmppc_xive_push_vcpu(struct kvm_v
                 * interrupt might have fired and be on its way to the
                 * host queue while we mask it, and if we unmask it
                 * early enough (re-cede right away), there is a
 -               * theorical possibility that it fires again, thus
 +               * theoretical possibility that it fires again, thus
                 * landing in the target queue more than once which is
                 * a big no-no.
                 *
@@@ -179,12 -781,13 +781,13 @@@ void kvmppc_xive_pull_vcpu(struct kvm_v
  }
  EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
  
- void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
+ bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
  {
        void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
+       bool ret = true;
  
        if (!esc_vaddr)
-               return;
+               return ret;
  
        /* we are using XIVE with single escalation */
  
                 * we also don't want to set xive_esc_on to 1 here in
                 * case we race with xive_esc_irq().
                 */
-               vcpu->arch.ceded = 0;
+               ret = false;
                /*
                 * The escalation interrupts are special as we don't EOI them.
                 * There is no need to use the load-after-store ordering offset
                __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
        }
        mb();
+       return ret;
  }
  EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
  
@@@ -238,7 -843,7 +843,7 @@@ static irqreturn_t xive_esc_irq(int irq
  
        vcpu->arch.irq_pending = 1;
        smp_mb();
-       if (vcpu->arch.ceded)
+       if (vcpu->arch.ceded || vcpu->arch.nested)
                kvmppc_fast_vcpu_kick(vcpu);
  
        /* Since we have the no-EOI flag, the interrupt is effectively
@@@ -622,7 -1227,7 +1227,7 @@@ static int xive_target_interrupt(struc
  
  /*
   * Targetting rules: In order to avoid losing track of
 - * pending interrupts accross mask and unmask, which would
 + * pending interrupts across mask and unmask, which would
   * allow queue overflows, we implement the following rules:
   *
   *  - Unless it was never enabled (or we run out of capacity)
@@@ -1073,7 -1678,7 +1678,7 @@@ int kvmppc_xive_clr_mapped(struct kvm *
        /*
         * If old_p is set, the interrupt is pending, we switch it to
         * PQ=11. This will force a resend in the host so the interrupt
 -       * isn't lost to whatver host driver may pick it up
 +       * isn't lost to whatever host driver may pick it up
         */
        if (state->old_p)
                xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
@@@ -309,7 -309,7 +309,7 @@@ static int kvmppc_core_vcpu_create_e500
        BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0);
        vcpu_e500 = to_e500(vcpu);
  
 -      /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */
 +      /* Invalid PIR value -- this LPID doesn't have valid state on any cpu */
        vcpu->arch.oldpir = 0xffffffff;
  
        err = kvmppc_e500_tlb_init(vcpu_e500);
@@@ -399,7 -399,6 +399,6 @@@ static int __init kvmppc_e500mc_init(vo
         * allocator.
         */
        kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core);
-       kvmppc_claim_lpid(0); /* host */
  
        r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
        if (r)
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/module.h>
  #include <linux/irqbypass.h>
  #include <linux/kvm_irqfd.h>
 +#include <linux/of.h>
  #include <asm/cputable.h>
  #include <linux/uaccess.h>
  #include <asm/kvm_ppc.h>
@@@ -426,9 -425,9 +426,9 @@@ int kvmppc_ld(struct kvm_vcpu *vcpu, ul
                return EMULATE_DONE;
        }
  
 -      vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +      kvm_vcpu_srcu_read_lock(vcpu);
        rc = kvm_read_guest(vcpu->kvm, pte.raddr, ptr, size);
 -      srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 +      kvm_vcpu_srcu_read_unlock(vcpu);
        if (rc)
                return EMULATE_DO_MMIO;
  
        return r;
  }
  
- static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)];
+ static DEFINE_IDA(lpid_inuse);
  static unsigned long nr_lpids;
  
  long kvmppc_alloc_lpid(void)
  {
-       long lpid;
+       int lpid;
  
-       do {
-               lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS);
-               if (lpid >= nr_lpids) {
+       /* The host LPID must always be 0 (allocation starts at 1) */
+       lpid = ida_alloc_range(&lpid_inuse, 1, nr_lpids - 1, GFP_KERNEL);
+       if (lpid < 0) {
+               if (lpid == -ENOMEM)
+                       pr_err("%s: Out of memory\n", __func__);
+               else
                        pr_err("%s: No LPIDs free\n", __func__);
-                       return -ENOMEM;
-               }
-       } while (test_and_set_bit(lpid, lpid_inuse));
+               return -ENOMEM;
+       }
  
        return lpid;
  }
  EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid);
  
- void kvmppc_claim_lpid(long lpid)
- {
-       set_bit(lpid, lpid_inuse);
- }
- EXPORT_SYMBOL_GPL(kvmppc_claim_lpid);
  void kvmppc_free_lpid(long lpid)
  {
-       clear_bit(lpid, lpid_inuse);
+       ida_free(&lpid_inuse, lpid);
  }
  EXPORT_SYMBOL_GPL(kvmppc_free_lpid);
  
+ /* nr_lpids_param includes the host LPID */
  void kvmppc_init_lpid(unsigned long nr_lpids_param)
  {
-       nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param);
-       memset(lpid_inuse, 0, sizeof(lpid_inuse));
+       nr_lpids = nr_lpids_param;
  }
  EXPORT_SYMBOL_GPL(kvmppc_init_lpid);
  
@@@ -111,7 -111,7 +111,7 @@@ static int __meminit vmemmap_populated(
  }
  
  /*
 - * vmemmap virtual address space management does not have a traditonal page
 + * vmemmap virtual address space management does not have a traditional page
   * table to track which virtual struct pages are backed by physical mapping.
   * The virtual to physical mappings are tracked in a simple linked list
   * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
@@@ -128,7 -128,7 +128,7 @@@ static struct vmemmap_backing *next
  
  /*
   * The same pointer 'next' tracks individual chunks inside the allocated
 - * full page during the boot time and again tracks the freeed nodes during
 + * full page during the boot time and again tracks the freed nodes during
   * runtime. It is racy but it does not happen as they are separated by the
   * boot process. Will create problem if some how we have memory hotplug
   * operation during boot !!
@@@ -372,6 -372,9 +372,9 @@@ void register_page_bootmem_memmap(unsig
  
  #ifdef CONFIG_PPC_BOOK3S_64
  unsigned int mmu_lpid_bits;
+ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ EXPORT_SYMBOL_GPL(mmu_lpid_bits);
+ #endif
  unsigned int mmu_pid_bits;
  
  static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
  #include <linux/rculist.h>
  #include <linux/sizes.h>
  #include <linux/debugfs.h>
 +#include <linux/of_address.h>
 +#include <linux/of_irq.h>
  
  #include <asm/sections.h>
  #include <asm/io.h>
 -#include <asm/prom.h>
  #include <asm/pci-bridge.h>
  #include <asm/machdep.h>
  #include <asm/msi_bitmap.h>
@@@ -1268,22 -1267,20 +1268,20 @@@ static bool pnv_pci_ioda_iommu_bypass_s
        return false;
  }
  
- static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
-                                                    bool real_mode)
+ static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb)
  {
-       return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
-               (phb->regs + 0x210);
+       return phb->regs + 0x210;
  }
  
  static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
-               unsigned long index, unsigned long npages, bool rm)
+               unsigned long index, unsigned long npages)
  {
        struct iommu_table_group_link *tgl = list_first_entry_or_null(
                        &tbl->it_group_list, struct iommu_table_group_link,
                        next);
        struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                        struct pnv_ioda_pe, table_group);
-       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
        unsigned long start, end, inc;
  
        start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
  
          mb(); /* Ensure above stores are visible */
          while (start <= end) {
-               if (rm)
-                       __raw_rm_writeq_be(start, invalidate);
-               else
-                       __raw_writeq_be(start, invalidate);
+               __raw_writeq_be(start, invalidate);
                  start += inc;
          }
  
@@@ -1321,7 -1314,7 +1315,7 @@@ static int pnv_ioda1_tce_build(struct i
                        attrs);
  
        if (!ret)
-               pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+               pnv_pci_p7ioc_tce_invalidate(tbl, index, npages);
  
        return ret;
  }
  #ifdef CONFIG_IOMMU_API
  /* Common for IODA1 and IODA2 */
  static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
-               unsigned long *hpa, enum dma_data_direction *direction,
-               bool realmode)
+               unsigned long *hpa, enum dma_data_direction *direction)
  {
-       return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
+       return pnv_tce_xchg(tbl, index, hpa, direction);
  }
  #endif
  
@@@ -1341,7 -1333,7 +1334,7 @@@ static void pnv_ioda1_tce_free(struct i
  {
        pnv_tce_free(tbl, index, npages);
  
-       pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+       pnv_pci_p7ioc_tce_invalidate(tbl, index, npages);
  }
  
  static struct iommu_table_ops pnv_ioda1_iommu_ops = {
  static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
  {
        /* 01xb - invalidate TCEs that match the specified PE# */
-       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
        unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
  
        mb(); /* Ensure above stores are visible */
        __raw_writeq_be(val, invalidate);
  }
  
- static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe,
                                        unsigned shift, unsigned long index,
                                        unsigned long npages)
  {
-       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
        unsigned long start, end, inc;
  
        /* We'll invalidate DMA address in PE scope */
        mb();
  
        while (start <= end) {
-               if (rm)
-                       __raw_rm_writeq_be(start, invalidate);
-               else
-                       __raw_writeq_be(start, invalidate);
+               __raw_writeq_be(start, invalidate);
                start += inc;
        }
  }
@@@ -1408,7 -1397,7 +1398,7 @@@ static inline void pnv_pci_ioda2_tce_in
  }
  
  static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
-               unsigned long index, unsigned long npages, bool rm)
+               unsigned long index, unsigned long npages)
  {
        struct iommu_table_group_link *tgl;
  
                unsigned int shift = tbl->it_page_shift;
  
                if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
-                       pnv_pci_phb3_tce_invalidate(pe, rm, shift,
+                       pnv_pci_phb3_tce_invalidate(pe, shift,
                                                    index, npages);
                else
                        opal_pci_tce_kill(phb->opal_id,
@@@ -1438,7 -1427,7 +1428,7 @@@ static int pnv_ioda2_tce_build(struct i
                        attrs);
  
        if (!ret)
-               pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+               pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
  
        return ret;
  }
@@@ -1448,7 -1437,7 +1438,7 @@@ static void pnv_ioda2_tce_free(struct i
  {
        pnv_tce_free(tbl, index, npages);
  
-       pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+       pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
  }
  
  static struct iommu_table_ops pnv_ioda2_iommu_ops = {
@@@ -2739,7 -2728,7 +2729,7 @@@ static void pnv_pci_ioda1_release_pe_dm
        if (rc != OPAL_SUCCESS)
                return;
  
-       pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
+       pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size);
        if (pe->table_group.group) {
                iommu_group_put(pe->table_group.group);
                WARN_ON(pe->table_group.group);
@@@ -666,8 -666,7 +666,7 @@@ static void pci_dma_bus_setup_pSeries(s
  
  #ifdef CONFIG_IOMMU_API
  static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
-                               long *tce, enum dma_data_direction *direction,
-                               bool realmode)
+                               long *tce, enum dma_data_direction *direction)
  {
        long rc;
        unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
@@@ -1430,7 -1429,7 +1429,7 @@@ static bool enable_ddw(struct pci_dev *
  
                pci->table_group->tables[1] = newtbl;
  
 -              /* Keep default DMA window stuct if removed */
 +              /* Keep default DMA window struct if removed */
                if (default_win_removed) {
                        tbl->it_size = 0;
                        vfree(tbl->it_map);