Merge tag 'kvm-4.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Author:     Linus Torvalds <torvalds@linux-foundation.org>
AuthorDate: Thu, 6 Oct 2016 17:49:01 +0000 (10:49 -0700)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Thu, 6 Oct 2016 17:49:01 +0000 (10:49 -0700)
Pull KVM updates from Radim Krčmář:
 "All architectures:
   - move `make kvmconfig` stubs from x86
   - use 64 bits for debugfs stats

  ARM:
   - Important fixes for not using an in-kernel irqchip
   - handle SError exceptions and present them to guests if appropriate
   - proxying of GICV access at EL2 if guest mappings are unsafe
   - GICv3 on AArch32 on ARMv8
   - preparations for GICv3 save/restore, including ABI docs
   - cleanups and a bit of optimization

  MIPS:
   - A couple of fixes in preparation for supporting MIPS EVA host
     kernels
   - MIPS SMP host & TLB invalidation fixes

  PPC:
   - Fix the bug which caused guests to falsely report lockups
   - other minor fixes
   - a small optimization

  s390:
   - Lazy enablement of runtime instrumentation
   - up to 255 CPUs for nested guests
   - rework of machine check delivery
   - cleanups and fixes

  x86:
   - IOMMU part of AMD's AVIC for vmexit-less interrupt delivery
   - Hyper-V TSC page
   - per-vcpu tsc_offset in debugfs
   - accelerated INS/OUTS in nVMX
   - cleanups and fixes"

* tag 'kvm-4.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (140 commits)
  KVM: MIPS: Drop dubious EntryHi optimisation
  KVM: MIPS: Invalidate TLB by regenerating ASIDs
  KVM: MIPS: Split kernel/user ASID regeneration
  KVM: MIPS: Drop other CPU ASIDs on guest MMU changes
  KVM: arm/arm64: vgic: Don't flush/sync without a working vgic
  KVM: arm64: Require in-kernel irqchip for PMU support
  KVM: PPC: Book3s PR: Allow access to unprivileged MMCR2 register
  KVM: PPC: Book3S PR: Support 64kB page size on POWER8E and POWER8NVL
  KVM: PPC: Book3S: Remove duplicate setting of the B field in tlbie
  KVM: PPC: BookE: Fix a sanity check
  KVM: PPC: Book3S HV: Take out virtual core piggybacking code
  KVM: PPC: Book3S: Treat VTB as a per-subcore register, not per-thread
  ARM: gic-v3: Work around definition of gic_write_bpr1
  KVM: nVMX: Fix the NMI IDT-vectoring handling
  KVM: VMX: Enable MSR-BASED TPR shadow even if APICv is inactive
  KVM: nVMX: Fix reload apic access page warning
  kvmconfig: add virtio-gpu to config fragment
  config: move x86 kvm_guest.config to a common location
  arm64: KVM: Remove duplicating init code for setting VMID
  ARM: KVM: Support vgic-v3
  ...

130 files changed:
Documentation/kernel-parameters.txt
Documentation/virtual/kvm/devices/arm-vgic-its.txt [new file with mode: 0644]
Documentation/virtual/kvm/devices/arm-vgic-v3.txt [new file with mode: 0644]
Documentation/virtual/kvm/devices/arm-vgic.txt
Documentation/virtual/kvm/devices/vcpu.txt
arch/arm/include/asm/arch_gicv3.h
arch/arm/include/asm/cp15.h
arch/arm/include/asm/cputype.h
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_emulate.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_hyp.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/uapi/asm/kvm.h
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/coproc.c
arch/arm/kvm/emulate.c
arch/arm/kvm/handle_exit.c
arch/arm/kvm/hyp/Makefile
arch/arm/kvm/hyp/entry.S
arch/arm/kvm/hyp/hyp-entry.S
arch/arm/kvm/hyp/switch.c
arch/arm/kvm/hyp/tlb.c
arch/arm/kvm/mmio.c
arch/arm/kvm/mmu.c
arch/arm64/include/asm/arch_gicv3.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/arm64/kvm/emulate.c [deleted file]
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/debug-sr.c
arch/arm64/kvm/hyp/entry.S
arch/arm64/kvm/hyp/hyp-entry.S
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/hyp/tlb.c
arch/arm64/kvm/hyp/vgic-v3-sr.c [deleted file]
arch/arm64/kvm/inject_fault.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/emulate.c
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c
arch/mips/kvm/trap_emul.c
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/io.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/pnv-pci.h
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xics.h
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/trace_hv.h
arch/powerpc/mm/hash_native_64.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/powernv/pci-ioda.c
arch/s390/include/asm/kvm_host.h
arch/s390/kernel/asm-offsets.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/guestdbg.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/x86/configs/kvm_guest.config [deleted file]
arch/x86/entry/vdso/vclock_gettime.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/pvclock.h
arch/x86/kernel/pvclock.c
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/debugfs.c [new file with mode: 0644]
arch/x86/kvm/hyperv.c
arch/x86/kvm/hyperv.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_proto.h
drivers/iommu/amd_iommu_types.h
include/kvm/arm_vgic.h
include/linux/amd-iommu.h
include/linux/kvm_host.h
kernel/configs/kvm_guest.config [new file with mode: 0644]
virt/kvm/arm/aarch32.c [new file with mode: 0644]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/hyp/vgic-v2-sr.c
virt/kvm/arm/hyp/vgic-v3-sr.c [new file with mode: 0644]
virt/kvm/arm/pmu.c
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/arm/vgic/vgic-irqfd.c
virt/kvm/arm/vgic/vgic-kvm-device.c
virt/kvm/arm/vgic/vgic-mmio-v3.c
virt/kvm/arm/vgic/vgic-mmio.c
virt/kvm/arm/vgic/vgic-mmio.h
virt/kvm/arm/vgic/vgic-v2.c
virt/kvm/arm/vgic/vgic.c
virt/kvm/arm/vgic/vgic.h
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

index 6fa1d8a..ec8d814 100644 (file)
@@ -460,6 +460,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        driver will print ACPI tables for AMD IOMMU during
                        IOMMU initialization.
 
+       amd_iommu_intr= [HW,X86-64]
+                       Specifies one of the following AMD IOMMU interrupt
+                       remapping modes:
+                       legacy     - Use legacy interrupt remapping mode.
+                       vapic      - Use virtual APIC mode, which allows IOMMU
+                                    to inject interrupts directly into the guest.
+                                    This mode requires kvm-amd.avic=1.
+                                    (Default when IOMMU HW support is present.)
+
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
                        Format: <a>,<b>
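
As an illustration of the new parameter, a host with AVIC-capable hardware
could be booted with the following command-line fragment (a sketch; the rest
of the command line is omitted).  As documented above, vapic mode also
requires kvm-amd.avic=1:

    amd_iommu_intr=vapic kvm-amd.avic=1
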
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
new file mode 100644 (file)
index 0000000..6081a5b
--- /dev/null
@@ -0,0 +1,38 @@
+ARM Virtual Interrupt Translation Service (ITS)
+===============================================
+
+Device types supported:
+  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
+
+The ITS allows MSI(-X) interrupts to be injected into guests. This extension is
+optional.  Creating a virtual ITS controller also requires a host GICv3 (see
+arm-vgic-v3.txt), but does not depend on having physical ITS controllers.
+
+There can be multiple ITS controllers per guest; each of them has to have
+a separate, non-overlapping MMIO region.
+
+
+Groups:
+  KVM_DEV_ARM_VGIC_GRP_ADDR
+  Attributes:
+    KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 ITS
+      control register frame.
+      This address needs to be 64K aligned and the region covers 128K.
+  Errors:
+    -E2BIG:  Address outside of addressable IPA range
+    -EINVAL: Incorrectly aligned address
+    -EEXIST: Address already configured
+    -EFAULT: Invalid user pointer for attr->addr.
+    -ENODEV: Incorrect attribute or the ITS is not supported.
+
+
+  KVM_DEV_ARM_VGIC_GRP_CTRL
+  Attributes:
+    KVM_DEV_ARM_VGIC_CTRL_INIT
+      request the initialization of the ITS; no additional parameter is
+      passed in kvm_device_attr.addr.
+  Errors:
+    -ENXIO:  ITS not properly configured as required prior to setting
+             this attribute
+    -ENOMEM: Memory shortage when allocating ITS internal data
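
To make the ABI above concrete, a minimal userspace sketch of creating and
initializing a vITS follows; vm_fd and the 64K-aligned its_base value are
assumptions, and error handling is reduced to early returns:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: create a vITS, set its control frame base, then initialize it. */
    static int create_vits(int vm_fd, uint64_t its_base /* 64K aligned */)
    {
            struct kvm_create_device cd = { .type = KVM_DEV_TYPE_ARM_VGIC_ITS };
            struct kvm_device_attr attr;

            if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                    return -1;

            attr = (struct kvm_device_attr) {
                    .group = KVM_DEV_ARM_VGIC_GRP_ADDR,
                    .attr  = KVM_VGIC_ITS_ADDR_TYPE,
                    .addr  = (uint64_t)&its_base,   /* userspace pointer to the base */
            };
            if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
                    return -1;

            attr = (struct kvm_device_attr) {
                    .group = KVM_DEV_ARM_VGIC_GRP_CTRL,
                    .attr  = KVM_DEV_ARM_VGIC_CTRL_INIT,
            };
            return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
    }
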
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
new file mode 100644 (file)
index 0000000..9348b3c
--- /dev/null
@@ -0,0 +1,206 @@
+ARM Virtual Generic Interrupt Controller v3 and later (VGICv3)
+==============================================================
+
+
+Device types supported:
+  KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
+
+Only one VGIC instance may be instantiated through this API.  The created VGIC
+will act as the VM interrupt controller, requiring emulated user-space devices
+to inject interrupts to the VGIC instead of directly to CPUs.  It is not
+possible to create both a GICv3 and GICv2 on the same VM.
+
+Creating a guest GICv3 device requires a host GICv3 as well.
+
+
+Groups:
+  KVM_DEV_ARM_VGIC_GRP_ADDR
+  Attributes:
+    KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 distributor
+      register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+      This address needs to be 64K aligned and the region covers 64 KByte.
+
+    KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3
+      redistributor register mappings. There are two 64K pages for each
+      VCPU and all of the redistributor pages are contiguous.
+      Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+      This address needs to be 64K aligned.
+  Errors:
+    -E2BIG:  Address outside of addressable IPA range
+    -EINVAL: Incorrectly aligned address
+    -EEXIST: Address already configured
+    -ENXIO:  The group or attribute is unknown/unsupported for this device
+             or hardware support is missing.
+    -EFAULT: Invalid user pointer for attr->addr.
+
+
+
+  KVM_DEV_ARM_VGIC_GRP_DIST_REGS
+  KVM_DEV_ARM_VGIC_GRP_REDIST_REGS
+  Attributes:
+    The attr field of kvm_device_attr encodes two values:
+    bits:     | 63   ....  32  |  31   ....    0 |
+    values:   |      mpidr     |      offset     |
+
+    All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a
+    __u32 value.  64-bit registers must be accessed by separately accessing
+    their lower and upper 32-bit halves.
+
+    Writes to read-only registers are ignored by the kernel.
+
+    KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers.
+    KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU
+    specified by the mpidr.
+
+    The offset is relative to the "[Re]Distributor base address" as defined
+    in the GICv3/4 specs.  Getting or setting such a register has the same
+    effect as reading or writing the register on real hardware, except for the
+    following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR,
+    GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0.  These registers behave
+    differently when accessed via this interface compared to their
+    architecturally defined behavior to allow software a full view of the
+    VGIC's internal state.
+
+    The mpidr field is used to specify which redistributor is accessed.  The
+    mpidr is ignored for the distributor.
+
+    The mpidr encoding is based on the affinity information in the
+    architecture defined MPIDR, and the field is encoded as follows:
+      | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+      |    Aff3    |    Aff2    |    Aff1    |    Aff0    |
+
+    Note that distributor fields are not banked, but return the same value
+    regardless of the mpidr used to access the register.
+
+    The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such
+    that a write of a clear bit has no effect, whereas a write with a set bit
+    clears that value.  To allow userspace to freely set the values of these two
+    registers, setting the attributes with the register offsets for these two
+    registers simply sets the non-reserved bits to the value written.
+
+
+    Accesses (reads and writes) to the GICD_ISPENDR register region and
+    GICR_ISPENDR0 registers get/set the value of the latched pending state for
+    the interrupts.
+
+    This is identical to the value returned by a guest read from ISPENDR for an
+    edge triggered interrupt, but may differ for level triggered interrupts.
+    For edge triggered interrupts, once an interrupt becomes pending (whether
+    because of an edge detected on the input line or because of a guest write
+    to ISPENDR) this state is "latched", and only cleared when either the
+    interrupt is activated or when the guest writes to ICPENDR. A level
+    triggered interrupt may be pending either because the level input is held
+    high by a device, or because of a guest write to the ISPENDR register. Only
+    ISPENDR writes are latched; if the device lowers the line level then the
+    interrupt is no longer pending unless the guest also wrote to ISPENDR, and
+    conversely writes to ICPENDR or activations of the interrupt do not clear
+    the pending status if the line level is still being held high.  (These
+    rules are documented in the GICv3 specification descriptions of the ICPENDR
+    and ISPENDR registers.) For a level triggered interrupt the value accessed
+    here is that of the latch which is set by ISPENDR and cleared by ICPENDR or
+    interrupt activation, whereas the value returned by a guest read from
+    ISPENDR is the logical OR of the latch value and the input line level.
+
+    Raw access to the latch state is provided to userspace so that it can save
+    and restore the entire GIC internal state (which is defined by the
+    combination of the current input line level and the latch state, and cannot
+    be deduced from purely the line level and the value of the ISPENDR
+    registers).
+
+    Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have
+    RAZ/WI semantics, meaning that reads always return 0 and writes are always
+    ignored.
+
+  Errors:
+    -ENXIO: Getting or setting this register is not yet supported
+    -EBUSY: One or more VCPUs are running
+
+
+  KVM_DEV_ARM_VGIC_CPU_SYSREGS
+  Attributes:
+    The attr field of kvm_device_attr encodes two values:
+    bits:     | 63      ....       32 | 31  ....  16 | 15  ....  0 |
+    values:   |         mpidr         |      RES     |    instr    |
+
+    The mpidr field encodes the CPU ID based on the affinity information in the
+    architecture defined MPIDR, and the field is encoded as follows:
+      | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+      |    Aff3    |    Aff2    |    Aff1    |    Aff0    |
+
+    The instr field encodes the system register to access based on the fields
+    defined in the A64 instruction set encoding for system register access
+    (RES means the bits are reserved for future use and should be zero):
+
+      | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 |
+      |    Op0    |    Op1    |    CRn   |   CRm   |   Op2   |
+
+    All system regs accessed through this API are (rw, 64-bit) and
+    kvm_device_attr.addr points to a __u64 value.
+
+    KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the
+    CPU specified by the mpidr field.
+
+  Errors:
+    -ENXIO: Getting or setting this register is not yet supported
+    -EBUSY: VCPU is running
+    -EINVAL: Invalid mpidr supplied
+
+
+  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
+  Attributes:
+    A value describing the number of interrupts (SGI, PPI and SPI) for
+    this GIC instance, ranging from 64 to 1024, in increments of 32.
+
+    kvm_device_attr.addr points to a __u32 value.
+
+  Errors:
+    -EINVAL: Value set is out of the expected range
+    -EBUSY: Value has already been set.
+
+
+  KVM_DEV_ARM_VGIC_GRP_CTRL
+  Attributes:
+    KVM_DEV_ARM_VGIC_CTRL_INIT
+      request the initialization of the VGIC; no additional parameter is
+      passed in kvm_device_attr.addr.
+  Errors:
+    -ENXIO:  VGIC not properly configured as required prior to setting
+             this attribute
+    -ENODEV: No online VCPU
+    -ENOMEM: Memory shortage when allocating VGIC internal data
+
+
+  KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO
+  Attributes:
+    The attr field of kvm_device_attr encodes the following values:
+    bits:     | 63      ....       32 | 31   ....    10 | 9  ....  0 |
+    values:   |         mpidr         |      info       |   vINTID   |
+
+    The vINTID specifies which set of IRQs is reported on.
+
+    The info field specifies which information userspace wants to get or set
+    using this interface.  Currently we support the following info values:
+
+      VGIC_LEVEL_INFO_LINE_LEVEL:
+       Get/Set the input level of the IRQ line for a set of 32 contiguously
+       numbered interrupts.
+       vINTID must be a multiple of 32.
+
+       kvm_device_attr.addr points to a __u32 value which will contain a
+       bitmap where a set bit means the interrupt level is asserted.
+
+       Bit[n] indicates the status for interrupt vINTID + n.
+
+    SGIs and any interrupt with a higher ID than the number of interrupts
+    supported will be RAZ/WI.  LPIs are always edge-triggered and are
+    therefore not supported by this interface.
+
+    PPIs are reported per VCPU as specified in the mpidr field, and SPIs are
+    reported with the same value regardless of the mpidr specified.
+
+    The mpidr field encodes the CPU ID based on the affinity information in the
+    architecture defined MPIDR, and the field is encoded as follows:
+      | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+      |    Aff3    |    Aff2    |    Aff1    |    Aff0    |
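
The attr encodings above reduce to shift-and-or arithmetic in userspace.  A
hedged sketch follows, assuming dev_fd refers to the VGICv3 device and that
the save/restore groups described here are implemented (this merge adds the
ABI documentation in preparation for the implementation):

    /* Sketch: read one 32-bit (re)distributor register.  mpidr_aff is the
     * packed Aff3..Aff0 value and offset is relative to the (re)distributor
     * base, both encoded into kvm_device_attr.attr per the tables above.
     */
    static int vgic_v3_get_reg(int dev_fd, uint32_t group,
                               uint64_t mpidr_aff, uint32_t offset,
                               uint32_t *val)
    {
            struct kvm_device_attr attr = {
                    .group = group,   /* ..._DIST_REGS or ..._REDIST_REGS */
                    .attr  = (mpidr_aff << 32) | offset,
                    .addr  = (uint64_t)val,
            };
            return ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &attr);
    }
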
index 89182f8..76e61c8 100644 (file)
@@ -1,24 +1,19 @@
-ARM Virtual Generic Interrupt Controller (VGIC)
-===============================================
+ARM Virtual Generic Interrupt Controller v2 (VGIC)
+==================================================
 
 Device types supported:
   KVM_DEV_TYPE_ARM_VGIC_V2     ARM Generic Interrupt Controller v2.0
-  KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
-  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
 
-Only one VGIC instance of the V2/V3 types above may be instantiated through
-either this API or the legacy KVM_CREATE_IRQCHIP api.  The created VGIC will
-act as the VM interrupt controller, requiring emulated user-space devices to
-inject interrupts to the VGIC instead of directly to CPUs.
+Only one VGIC instance may be instantiated through either this API or the
+legacy KVM_CREATE_IRQCHIP API.  The created VGIC will act as the VM interrupt
+controller, requiring emulated user-space devices to inject interrupts to the
+VGIC instead of directly to CPUs.
 
-Creating a guest GICv3 device requires a host GICv3 as well.
-GICv3 implementations with hardware compatibility support allow a guest GICv2
-as well.
+GICv3 implementations with hardware compatibility support allow creating a
+guest GICv2 through this interface.  For information on creating a guest GICv3
+device and guest ITS devices, see arm-vgic-v3.txt.  It is not possible to
+create both a GICv3 and GICv2 device on the same VM.
 
-Creating a virtual ITS controller requires a host GICv3 (but does not depend
-on having physical ITS controllers).
-There can be multiple ITS controllers per guest, each of them has to have
-a separate, non-overlapping MMIO region.
 
 Groups:
   KVM_DEV_ARM_VGIC_GRP_ADDR
@@ -32,26 +27,13 @@ Groups:
       Base address in the guest physical address space of the GIC virtual cpu
       interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2.
       This address needs to be 4K aligned and the region covers 4 KByte.
-
-    KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
-      Base address in the guest physical address space of the GICv3 distributor
-      register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
-      This address needs to be 64K aligned and the region covers 64 KByte.
-
-    KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
-      Base address in the guest physical address space of the GICv3
-      redistributor register mappings. There are two 64K pages for each
-      VCPU and all of the redistributor pages are contiguous.
-      Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
-      This address needs to be 64K aligned.
-
-    KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
-      Base address in the guest physical address space of the GICv3 ITS
-      control register frame. The ITS allows MSI(-X) interrupts to be
-      injected into guests. This extension is optional. If the kernel
-      does not support the ITS, the call returns -ENODEV.
-      Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
-      This address needs to be 64K aligned and the region covers 128K.
+  Errors:
+    -E2BIG:  Address outside of addressable IPA range
+    -EINVAL: Incorrectly aligned address
+    -EEXIST: Address already configured
+    -ENXIO:  The group or attribute is unknown/unsupported for this device
+             or hardware support is missing.
+    -EFAULT: Invalid user pointer for attr->addr.
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
index c041658..02f5068 100644 (file)
@@ -30,4 +30,6 @@ Returns: -ENODEV: PMUv3 not supported
                  attribute
          -EBUSY: PMUv3 already initialized
 
-Request the initialization of the PMUv3.
+Request the initialization of the PMUv3.  This must be done after creating the
+in-kernel irqchip.  Creating a PMU with a userspace irqchip is currently not
+supported.
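
In userspace the new ordering requirement reads as "irqchip first, PMU
second".  A hedged sketch, assuming vcpu_fd and an already created and
initialized in-kernel vGIC:

    /* Sketch: initialize the vPMU only after the in-kernel irqchip exists;
     * with this merge the kernel rejects PMU init with a userspace irqchip.
     */
    struct kvm_device_attr pmu_init = {
            .group = KVM_ARM_VCPU_PMU_V3_CTRL,
            .attr  = KVM_ARM_VCPU_PMU_V3_INIT,
    };
    ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &pmu_init);
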
index dfe4002..a808829 100644 (file)
@@ -22,9 +22,7 @@
 
 #include <linux/io.h>
 #include <asm/barrier.h>
-
-#define __ACCESS_CP15(CRn, Op1, CRm, Op2)      p15, Op1, %0, CRn, CRm, Op2
-#define __ACCESS_CP15_64(Op1, CRm)             p15, Op1, %Q0, %R0, CRm
+#include <asm/cp15.h>
 
 #define ICC_EOIR1                      __ACCESS_CP15(c12, 0, c12, 1)
 #define ICC_DIR                                __ACCESS_CP15(c12, 0, c11, 1)
 #define ICH_AP1R2                      __AP1Rx(2)
 #define ICH_AP1R3                      __AP1Rx(3)
 
+/* A32-to-A64 mappings used by VGIC save/restore */
+
+#define CPUIF_MAP(a32, a64)                    \
+static inline void write_ ## a64(u32 val)      \
+{                                              \
+       write_sysreg(val, a32);                 \
+}                                              \
+static inline u32 read_ ## a64(void)           \
+{                                              \
+       return read_sysreg(a32);                \
+}                                              \
+
+#define CPUIF_MAP_LO_HI(a32lo, a32hi, a64)     \
+static inline void write_ ## a64(u64 val)      \
+{                                              \
+       write_sysreg(lower_32_bits(val), a32lo);\
+       write_sysreg(upper_32_bits(val), a32hi);\
+}                                              \
+static inline u64 read_ ## a64(void)           \
+{                                              \
+       u64 val = read_sysreg(a32lo);           \
+                                               \
+       val |=  (u64)read_sysreg(a32hi) << 32;  \
+                                               \
+       return val;                             \
+}
+
+CPUIF_MAP(ICH_HCR, ICH_HCR_EL2)
+CPUIF_MAP(ICH_VTR, ICH_VTR_EL2)
+CPUIF_MAP(ICH_MISR, ICH_MISR_EL2)
+CPUIF_MAP(ICH_EISR, ICH_EISR_EL2)
+CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2)
+CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2)
+CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2)
+CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2)
+CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2)
+CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2)
+CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2)
+CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2)
+CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2)
+CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2)
+CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2)
+CPUIF_MAP(ICC_SRE, ICC_SRE_EL1)
+
+CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2)
+CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2)
+CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2)
+CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2)
+CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2)
+CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2)
+CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2)
+CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2)
+CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2)
+CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2)
+CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2)
+CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2)
+CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2)
+CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2)
+CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2)
+CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2)
+
+#define read_gicreg(r)                 read_##r()
+#define write_gicreg(v, r)             write_##r(v)
+
 /* Low-level accessors */
 
 static inline void gic_write_eoir(u32 irq)
 {
-       asm volatile("mcr " __stringify(ICC_EOIR1) : : "r" (irq));
+       write_sysreg(irq, ICC_EOIR1);
        isb();
 }
 
 static inline void gic_write_dir(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_DIR) : : "r" (val));
+       write_sysreg(val, ICC_DIR);
        isb();
 }
 
 static inline u32 gic_read_iar(void)
 {
-       u32 irqstat;
+       u32 irqstat = read_sysreg(ICC_IAR1);
 
-       asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat));
        dsb(sy);
+
        return irqstat;
 }
 
 static inline void gic_write_pmr(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_PMR) : : "r" (val));
+       write_sysreg(val, ICC_PMR);
 }
 
 static inline void gic_write_ctlr(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_CTLR) : : "r" (val));
+       write_sysreg(val, ICC_CTLR);
        isb();
 }
 
 static inline void gic_write_grpen1(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_IGRPEN1) : : "r" (val));
+       write_sysreg(val, ICC_IGRPEN1);
        isb();
 }
 
 static inline void gic_write_sgi1r(u64 val)
 {
-       asm volatile("mcrr " __stringify(ICC_SGI1R) : : "r" (val));
+       write_sysreg(val, ICC_SGI1R);
 }
 
 static inline u32 gic_read_sre(void)
 {
-       u32 val;
-
-       asm volatile("mrc " __stringify(ICC_SRE) : "=r" (val));
-       return val;
+       return read_sysreg(ICC_SRE);
 }
 
 static inline void gic_write_sre(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_SRE) : : "r" (val));
+       write_sysreg(val, ICC_SRE);
        isb();
 }
 
 static inline void gic_write_bpr1(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val));
+       write_sysreg(val, ICC_BPR1);
 }
 
 /*
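
The CPUIF_MAP blocks above let the shared VGICv3 save/restore code be written
once against the AArch64 register names.  A hedged sketch of the effect,
assuming this header is included (nr_list_regs_example is not a kernel
function):

    /* Sketch: on 32-bit ARM, read_gicreg(ICH_VTR_EL2) resolves to
     * read_ICH_VTR_EL2() and thence to read_sysreg(ICH_VTR), i.e. an mrc
     * of the cp15 encoding of ICH_VTR.
     */
    static inline unsigned int nr_list_regs_example(void)
    {
            /* ListRegs is bits [4:0] of ICH_VTR_EL2; value is LRs - 1 */
            return (read_gicreg(ICH_VTR_EL2) & 0x1f) + 1;
    }
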
index c3f1152..dbdbce1 100644 (file)
 
 #ifdef CONFIG_CPU_CP15
 
+#define __ACCESS_CP15(CRn, Op1, CRm, Op2)      \
+       "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
+#define __ACCESS_CP15_64(Op1, CRm)             \
+       "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
+
+#define __read_sysreg(r, w, c, t) ({                           \
+       t __val;                                                \
+       asm volatile(r " " c : "=r" (__val));                   \
+       __val;                                                  \
+})
+#define read_sysreg(...)               __read_sysreg(__VA_ARGS__)
+
+#define __write_sysreg(v, r, w, c, t)  asm volatile(w " " c : : "r" ((t)(v)))
+#define write_sysreg(v, ...)           __write_sysreg(v, __VA_ARGS__)
+
 extern unsigned long cr_alignment;     /* defined in entry-armv.S */
 
 static inline unsigned long get_cr(void)
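
With the accessors now living in cp15.h, any coprocessor register defined via
__ACCESS_CP15 gets typed read/write helpers.  A hedged sketch using ICC_PMR
from arch_gicv3.h as the example register (set_icc_pmr_example is not a
kernel function):

    /* Sketch: write_sysreg(val, ICC_PMR) expands to roughly
     *   asm volatile("mcr p15, 0, %0, c4, c6, 0" : : "r" ((u32)(val)));
     * and read_sysreg(ICC_PMR) to the matching mrc form.
     */
    static inline void set_icc_pmr_example(u32 val)
    {
            write_sysreg(val, ICC_PMR);
    }
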
index 754f86f..522b5fe 100644 (file)
@@ -55,6 +55,7 @@
 
 #define MPIDR_LEVEL_BITS 8
 #define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)
+#define MPIDR_LEVEL_SHIFT(level) (MPIDR_LEVEL_BITS * level)
 
 #define MPIDR_AFFINITY_LEVEL(mpidr, level) \
        ((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)
index 58faff5..d7ea6bc 100644 (file)
 
 #include <asm/virt.h>
 
+#define ARM_EXIT_WITH_ABORT_BIT  31
+#define ARM_EXCEPTION_CODE(x)    ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT))
+#define ARM_ABORT_PENDING(x)     !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT))
+
 #define ARM_EXCEPTION_RESET      0
 #define ARM_EXCEPTION_UNDEFINED   1
 #define ARM_EXCEPTION_SOFTWARE    2
@@ -68,6 +72,9 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 extern void __init_stage2_translation(void);
 
 extern void __kvm_hyp_reset(unsigned long);
+
+extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern void __vgic_v3_init_lrs(void);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index ee5328f..9a8a45a 100644 (file)
@@ -40,18 +40,29 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
        *vcpu_reg(vcpu, reg_num) = val;
 }
 
-bool kvm_condition_valid(struct kvm_vcpu *vcpu);
-void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr);
+bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
+void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
 void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_vabt(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
+{
+       return kvm_condition_valid32(vcpu);
+}
+
+static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+       kvm_skip_instr32(vcpu, is_wide_instr);
+}
+
 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.hcr = HCR_GUEST_MASK;
 }
 
-static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.hcr;
 }
@@ -61,7 +72,7 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
        vcpu->arch.hcr = hcr;
 }
 
-static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
+static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
 {
        return 1;
 }
@@ -71,9 +82,9 @@ static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu)
        return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc;
 }
 
-static inline unsigned long *vcpu_cpsr(struct kvm_vcpu *vcpu)
+static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
 {
-       return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
+       return (unsigned long *)&vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
 }
 
 static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
@@ -93,11 +104,21 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
        return cpsr_mode > USR_MODE;
 }
 
-static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu)
+static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault.hsr;
 }
 
+static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
+{
+       u32 hsr = kvm_vcpu_get_hsr(vcpu);
+
+       if (hsr & HSR_CV)
+               return (hsr & HSR_COND) >> HSR_COND_SHIFT;
+
+       return -1;
+}
+
 static inline unsigned long kvm_vcpu_get_hfar(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault.hxfar;
index de338d9..2d19e02 100644 (file)
 
 #include <kvm/arm_vgic.h>
 
+
+#ifdef CONFIG_ARM_GIC_V3
+#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
+#else
 #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
+#endif
 
 #define KVM_REQ_VCPU_EXIT      8
 
@@ -183,15 +188,15 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 hvc_exit_stat;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 hvc_exit_stat;
        u64 wfe_exit_stat;
        u64 wfi_exit_stat;
        u64 mmio_exit_user;
index 6eaff28..343135e 100644 (file)
 
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
+#include <asm/cp15.h>
 #include <asm/kvm_mmu.h>
 #include <asm/vfp.h>
 
 #define __hyp_text __section(.hyp.text) notrace
 
-#define __ACCESS_CP15(CRn, Op1, CRm, Op2)      \
-       "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
-#define __ACCESS_CP15_64(Op1, CRm)             \
-       "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
 #define __ACCESS_VFP(CRn)                      \
        "mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32
 
-#define __write_sysreg(v, r, w, c, t)  asm volatile(w " " c : : "r" ((t)(v)))
-#define write_sysreg(v, ...)           __write_sysreg(v, __VA_ARGS__)
-
-#define __read_sysreg(r, w, c, t) ({                           \
-       t __val;                                                \
-       asm volatile(r " " c : "=r" (__val));                   \
-       __val;                                                  \
-})
-#define read_sysreg(...)               __read_sysreg(__VA_ARGS__)
-
 #define write_special(v, r)                                    \
        asm volatile("msr " __stringify(r) ", %0" : : "r" (v))
 #define read_special(r) ({                                     \
@@ -119,6 +106,9 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 void __sysreg_save_state(struct kvm_cpu_context *ctxt);
 void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
 
+void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+
 void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp);
 void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp);
 static inline bool __vfp_enabled(void)
index 3bb803d..74a4472 100644 (file)
@@ -63,37 +63,13 @@ void kvm_clear_hyp_idmap(void);
 static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
 {
        *pmd = new_pmd;
-       flush_pmd_entry(pmd);
+       dsb(ishst);
 }
 
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
        *pte = new_pte;
-       /*
-        * flush_pmd_entry just takes a void pointer and cleans the necessary
-        * cache entries, so we can reuse the function for ptes.
-        */
-       flush_pmd_entry(pte);
-}
-
-static inline void kvm_clean_pgd(pgd_t *pgd)
-{
-       clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t));
-}
-
-static inline void kvm_clean_pmd(pmd_t *pmd)
-{
-       clean_dcache_area(pmd, PTRS_PER_PMD * sizeof(pmd_t));
-}
-
-static inline void kvm_clean_pmd_entry(pmd_t *pmd)
-{
-       clean_pmd_entry(pmd);
-}
-
-static inline void kvm_clean_pte(pte_t *pte)
-{
-       clean_pte_table(pte);
+       dsb(ishst);
 }
 
 static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
index a2b3eb3..b38c10c 100644 (file)
@@ -84,6 +84,13 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE          0x1000
 #define KVM_VGIC_V2_CPU_SIZE           0x2000
 
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST     2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST   3
+
+#define KVM_VGIC_V3_DIST_SIZE          SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE                (2 * SZ_64K)
+
 #define KVM_ARM_VCPU_POWER_OFF         0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2          1 /* CPU uses PSCI v0.2 */
 
index 10d77a6..f19842e 100644 (file)
@@ -21,13 +21,16 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+obj-y += $(KVM)/arm/aarch32.o
 
 obj-y += $(KVM)/arm/vgic/vgic.o
 obj-y += $(KVM)/arm/vgic/vgic-init.o
 obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
 obj-y += $(KVM)/arm/vgic/vgic-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-v3.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o
 obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
 obj-y += $(KVM)/irqchip.o
 obj-y += $(KVM)/arm/arch_timer.o
index c94b90d..03e9273 100644 (file)
@@ -144,6 +144,16 @@ out_fail_alloc:
        return ret;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
        return VM_FAULT_SIGBUS;
@@ -1176,6 +1186,10 @@ static int init_common_resources(void)
                return -ENOMEM;
        }
 
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+
        return 0;
 }
 
@@ -1241,10 +1255,6 @@ static void teardown_hyp_mode(void)
 
 static int init_vhe_mode(void)
 {
-       /* set size of VMID supported by CPU */
-       kvm_vmid_bits = kvm_get_vmid_bits();
-       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
        kvm_info("VHE mode initialized successfully\n");
        return 0;
 }
@@ -1328,10 +1338,6 @@ static int init_hyp_mode(void)
                }
        }
 
-       /* set size of VMID supported by CPU */
-       kvm_vmid_bits = kvm_get_vmid_bits();
-       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
        kvm_info("Hyp mode initialized successfully\n");
 
        return 0;
index 1bb2b79..3e5e419 100644 (file)
@@ -228,6 +228,35 @@ bool access_vm_reg(struct kvm_vcpu *vcpu,
        return true;
 }
 
+static bool access_gic_sgi(struct kvm_vcpu *vcpu,
+                          const struct coproc_params *p,
+                          const struct coproc_reg *r)
+{
+       u64 reg;
+
+       if (!p->is_write)
+               return read_from_write_only(vcpu, p);
+
+       reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
+       reg |= *vcpu_reg(vcpu, p->Rt1);
+
+       vgic_v3_dispatch_sgi(vcpu, reg);
+
+       return true;
+}
+
+static bool access_gic_sre(struct kvm_vcpu *vcpu,
+                          const struct coproc_params *p,
+                          const struct coproc_reg *r)
+{
+       if (p->is_write)
+               return ignore_write(vcpu, p);
+
+       *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+
+       return true;
+}
+
 /*
  * We could trap ID_DFR0 and tell the guest we don't support performance
  * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -361,10 +390,16 @@ static const struct coproc_reg cp15_regs[] = {
        { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32,
                        access_vm_reg, reset_unknown, c10_AMAIR1},
 
+       /* ICC_SGI1R */
+       { CRm64(12), Op1( 0), is64, access_gic_sgi},
+
        /* VBAR: swapped by interrupt.S. */
        { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32,
                        NULL, reset_val, c12_VBAR, 0x00000000 },
 
+       /* ICC_SRE */
+       { CRn(12), CRm(12), Op1( 0), Op2(5), is32, access_gic_sre },
+
        /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */
        { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32,
                        access_vm_reg, reset_val, c13_CID, 0x00000000 },
index af93e3f..0064b86 100644 (file)
@@ -161,105 +161,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
        }
 }
 
-/*
- * A conditional instruction is allowed to trap, even though it
- * wouldn't be executed.  So let's re-implement the hardware, in
- * software!
- */
-bool kvm_condition_valid(struct kvm_vcpu *vcpu)
-{
-       unsigned long cpsr, cond, insn;
-
-       /*
-        * Exception Code 0 can only happen if we set HCR.TGE to 1, to
-        * catch undefined instructions, and then we won't get past
-        * the arm_exit_handlers test anyway.
-        */
-       BUG_ON(!kvm_vcpu_trap_get_class(vcpu));
-
-       /* Top two bits non-zero?  Unconditional. */
-       if (kvm_vcpu_get_hsr(vcpu) >> 30)
-               return true;
-
-       cpsr = *vcpu_cpsr(vcpu);
-
-       /* Is condition field valid? */
-       if ((kvm_vcpu_get_hsr(vcpu) & HSR_CV) >> HSR_CV_SHIFT)
-               cond = (kvm_vcpu_get_hsr(vcpu) & HSR_COND) >> HSR_COND_SHIFT;
-       else {
-               /* This can happen in Thumb mode: examine IT state. */
-               unsigned long it;
-
-               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-               /* it == 0 => unconditional. */
-               if (it == 0)
-                       return true;
-
-               /* The cond for this insn works out as the top 4 bits. */
-               cond = (it >> 4);
-       }
-
-       /* Shift makes it look like an ARM-mode instruction */
-       insn = cond << 28;
-       return arm_check_condition(insn, cpsr) != ARM_OPCODE_CONDTEST_FAIL;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:      The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-       unsigned long itbits, cond;
-       unsigned long cpsr = *vcpu_cpsr(vcpu);
-       bool is_arm = !(cpsr & PSR_T_BIT);
-
-       BUG_ON(is_arm && (cpsr & PSR_IT_MASK));
-
-       if (!(cpsr & PSR_IT_MASK))
-               return;
-
-       cond = (cpsr & 0xe000) >> 13;
-       itbits = (cpsr & 0x1c00) >> (10 - 2);
-       itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-       /* Perform ITAdvance (see page A-52 in ARM DDI 0406C) */
-       if ((itbits & 0x7) == 0)
-               itbits = cond = 0;
-       else
-               itbits = (itbits << 1) & 0x1f;
-
-       cpsr &= ~PSR_IT_MASK;
-       cpsr |= cond << 13;
-       cpsr |= (itbits & 0x1c) << (10 - 2);
-       cpsr |= (itbits & 0x3) << 25;
-       *vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-       bool is_thumb;
-
-       is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT);
-       if (is_thumb && !is_wide_instr)
-               *vcpu_pc(vcpu) += 2;
-       else
-               *vcpu_pc(vcpu) += 4;
-       kvm_adjust_itstate(vcpu);
-}
-
-
 /******************************************************************************
  * Inject exceptions into the guest
  */
@@ -402,3 +303,15 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
 {
        inject_abt(vcpu, true, addr);
 }
+
+/**
+ * kvm_inject_vabt - inject an async abort / SError into the guest
+ * @vcpu: The VCPU to receive the exception
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+{
+       vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA);
+}
index 3f1ef0d..4e40d19 100644 (file)
 
 typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
 
-static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       /* SVC called from Hyp mode should never get here */
-       kvm_debug("SVC called from Hyp mode shouldn't go here\n");
-       BUG();
-       return -EINVAL; /* Squash warning */
-}
-
 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        int ret;
@@ -59,22 +51,6 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 1;
 }
 
-static int handle_pabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       /* The hypervisor should never cause aborts */
-       kvm_err("Prefetch Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n",
-               kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu));
-       return -EFAULT;
-}
-
-static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       /* This is either an error in the ws. code or an external abort */
-       kvm_err("Data Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n",
-               kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu));
-       return -EFAULT;
-}
-
 /**
  * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests
  * @vcpu:      the vcpu pointer
@@ -112,13 +88,10 @@ static exit_handle_fn arm_exit_handlers[] = {
        [HSR_EC_CP14_64]        = kvm_handle_cp14_access,
        [HSR_EC_CP_0_13]        = kvm_handle_cp_0_13_access,
        [HSR_EC_CP10_ID]        = kvm_handle_cp10_id,
-       [HSR_EC_SVC_HYP]        = handle_svc_hyp,
        [HSR_EC_HVC]            = handle_hvc,
        [HSR_EC_SMC]            = handle_smc,
        [HSR_EC_IABT]           = kvm_handle_guest_abort,
-       [HSR_EC_IABT_HYP]       = handle_pabt_hyp,
        [HSR_EC_DABT]           = kvm_handle_guest_abort,
-       [HSR_EC_DABT_HYP]       = handle_dabt_hyp,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
@@ -144,6 +117,25 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 {
        exit_handle_fn exit_handler;
 
+       if (ARM_ABORT_PENDING(exception_index)) {
+               u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
+
+               /*
+                * HVC/SMC already have an adjusted PC, which we need
+                * to correct in order to return to after having
+                * injected the abort.
+                */
+               if (hsr_ec == HSR_EC_HVC || hsr_ec == HSR_EC_SMC) {
+                       u32 adj =  kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
+                       *vcpu_pc(vcpu) -= adj;
+               }
+
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
+
+       exception_index = ARM_EXCEPTION_CODE(exception_index);
+
        switch (exception_index) {
        case ARM_EXCEPTION_IRQ:
                return 1;
@@ -160,6 +152,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                exit_handler = kvm_get_exit_handler(vcpu);
 
                return exit_handler(vcpu, run);
+       case ARM_EXCEPTION_DATA_ABORT:
+               kvm_inject_vabt(vcpu);
+               return 1;
        default:
                kvm_pr_unimpl("Unsupported exception type: %d",
                              exception_index);
index 8dfa5f7..3023bb5 100644 (file)
@@ -5,6 +5,7 @@
 KVM=../../../../virt/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
 
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
index 21c2388..60783f3 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
 
        .arch_extension     virt
 
@@ -63,6 +64,36 @@ ENTRY(__guest_exit)
        ldr     lr, [r0, #4]
 
        mov     r0, r1
+       mrs     r1, SPSR
+       mrs     r2, ELR_hyp
+       mrc     p15, 4, r3, c5, c2, 0   @ HSR
+
+       /*
+        * Force loads and stores to complete before unmasking aborts
+        * and forcing the delivery of the exception. This gives us a
+        * single instruction window, which the handler will try to
+        * match.
+        */
+       dsb     sy
+       cpsie   a
+
+       .global abort_guest_exit_start
+abort_guest_exit_start:
+
+       isb
+
+       .global abort_guest_exit_end
+abort_guest_exit_end:
+
+       /*
+        * If we took an abort, r0[31] will be set, and cmp will set
+        * the N bit in PSTATE.
+        */
+       cmp     r0, #0
+       msrmi   SPSR_cxsf, r1
+       msrmi   ELR_hyp, r2
+       mcrmi   p15, 4, r3, c5, c2, 0   @ HSR
+
        bx      lr
 ENDPROC(__guest_exit)
 
index 7809138..96beb53 100644 (file)
@@ -81,7 +81,6 @@ __kvm_hyp_vector:
        invalid_vector  hyp_undef       ARM_EXCEPTION_UNDEFINED
        invalid_vector  hyp_svc         ARM_EXCEPTION_SOFTWARE
        invalid_vector  hyp_pabt        ARM_EXCEPTION_PREF_ABORT
-       invalid_vector  hyp_dabt        ARM_EXCEPTION_DATA_ABORT
        invalid_vector  hyp_fiq         ARM_EXCEPTION_FIQ
 
 ENTRY(__hyp_do_panic)
@@ -164,6 +163,21 @@ hyp_irq:
        load_vcpu r0                    @ Load VCPU pointer to r0
        b       __guest_exit
 
+hyp_dabt:
+       push    {r0, r1}
+       mrs     r0, ELR_hyp
+       ldr     r1, =abort_guest_exit_start
+THUMB( add     r1, r1, #1)
+       cmp     r0, r1
+       ldrne   r1, =abort_guest_exit_end
+THUMB( addne   r1, r1, #1)
+       cmpne   r0, r1
+       pop     {r0, r1}
+       bne     __hyp_panic
+
+       orr     r0, r0, #(1 << ARM_EXIT_WITH_ABORT_BIT)
+       eret
+
        .ltorg
 
        .popsection
index b13caa9..92678b7 100644 (file)
@@ -14,6 +14,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+#include <linux/jump_label.h>
 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_hyp.h>
@@ -54,6 +55,15 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
 {
        u32 val;
 
+       /*
+        * If we pended a virtual abort, preserve it until it gets
+        * cleared. See B1.9.9 (Virtual Abort exception) for details,
+        * but the crucial bit is the zeroing of HCR.VA in the
+        * pseudocode.
+        */
+       if (vcpu->arch.hcr & HCR_VA)
+               vcpu->arch.hcr = read_sysreg(HCR);
+
        write_sysreg(0, HCR);
        write_sysreg(0, HSTR);
        val = read_sysreg(HDCR);
@@ -74,14 +84,21 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
        write_sysreg(read_sysreg(MIDR), VPIDR);
 }
 
+
 static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
 {
-       __vgic_v2_save_state(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_save_state(vcpu);
+       else
+               __vgic_v2_save_state(vcpu);
 }
 
 static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
 {
-       __vgic_v2_restore_state(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_restore_state(vcpu);
+       else
+               __vgic_v2_restore_state(vcpu);
 }
 
 static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
@@ -134,7 +151,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
        return true;
 }
 
-static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
@@ -191,8 +208,6 @@ again:
        return exit_code;
 }
 
-__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-
 static const char * const __hyp_panic_string[] = {
        [ARM_EXCEPTION_RESET]      = "\nHYP panic: RST   PC:%08x CPSR:%08x",
        [ARM_EXCEPTION_UNDEFINED]  = "\nHYP panic: UNDEF PC:%08x CPSR:%08x",
index a263600..7296528 100644 (file)
@@ -34,7 +34,7 @@
  * As v7 does not support flushing per IPA, just nuke the whole TLB
  * instead, ignoring the ipa value.
  */
-static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
 {
        dsb(ishst);
 
@@ -50,21 +50,14 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
        write_sysreg(0, VTTBR);
 }
 
-__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
-static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
-       __tlb_flush_vmid(kvm);
+       __kvm_tlb_flush_vmid(kvm);
 }
 
-__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
-                                                           phys_addr_t ipa);
-
-static void __hyp_text __tlb_flush_vm_context(void)
+void __hyp_text __kvm_flush_vm_context(void)
 {
        write_sysreg(0, TLBIALLNSNHIS);
        write_sysreg(0, ICIALLUIS);
        dsb(ish);
 }
-
-__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
index 10f80a6..b6e715f 100644 (file)
@@ -126,12 +126,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
        int access_size;
        bool sign_extend;
 
-       if (kvm_vcpu_dabt_isextabt(vcpu)) {
-               /* cache operation on I/O addr, tell guest unsupported */
-               kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-               return 1;
-       }
-
        if (kvm_vcpu_dabt_iss1tw(vcpu)) {
                /* page table accesses IO mem: tell guest to fix its TTBR */
                kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
index e9a5c0e..a5265ed 100644 (file)
@@ -744,7 +744,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
        if (!pgd)
                return -ENOMEM;
 
-       kvm_clean_pgd(pgd);
        kvm->arch.pgd = pgd;
        return 0;
 }
@@ -936,7 +935,6 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
                if (!cache)
                        return 0; /* ignore calls from kvm_set_spte_hva */
                pte = mmu_memory_cache_alloc(cache);
-               kvm_clean_pte(pte);
                pmd_populate_kernel(NULL, pmd, pte);
                get_page(virt_to_page(pmd));
        }
@@ -1434,6 +1432,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
        int ret, idx;
 
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+       if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
+
        fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
 
        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
index fc2a0cb..f8ae6d6 100644 (file)
 #include <linux/stringify.h>
 #include <asm/barrier.h>
 
+#define read_gicreg(r)                                                 \
+       ({                                                              \
+               u64 reg;                                                \
+               asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+               reg;                                                    \
+       })
+
+#define write_gicreg(v,r)                                              \
+       do {                                                            \
+               u64 __val = (v);                                        \
+               asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+       } while (0)
+
 /*
  * Low-level accessors
  *
index 4b5c977..2a2752b 100644 (file)
@@ -50,7 +50,7 @@
 #define HCR_BSU                (3 << 10)
 #define HCR_BSU_IS     (UL(1) << 10)
 #define HCR_FB         (UL(1) << 9)
-#define HCR_VA         (UL(1) << 8)
+#define HCR_VSE                (UL(1) << 8)
 #define HCR_VI         (UL(1) << 7)
 #define HCR_VF         (UL(1) << 6)
 #define HCR_AMO                (UL(1) << 5)
@@ -80,7 +80,7 @@
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
                         HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
                         HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW)
-#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
+#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
 #define HCR_INT_OVERRIDE   (HCR_FMO | HCR_IMO)
 #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
 
index 7561f63..18f7465 100644 (file)
 
 #include <asm/virt.h>
 
+#define ARM_EXIT_WITH_SERROR_BIT  31
+#define ARM_EXCEPTION_CODE(x)    ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
+#define ARM_SERROR_PENDING(x)    !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT))
+
 #define ARM_EXCEPTION_IRQ        0
-#define ARM_EXCEPTION_TRAP       1
+#define ARM_EXCEPTION_EL1_SERROR  1
+#define ARM_EXCEPTION_TRAP       2
 /* The hyp-stub will return this for any kvm_call_hyp() call */
-#define ARM_EXCEPTION_HYP_GONE   2
+#define ARM_EXCEPTION_HYP_GONE   3
 
 #define KVM_ARM64_DEBUG_DIRTY_SHIFT    0
 #define KVM_ARM64_DEBUG_DIRTY          (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
index 4cdeae3..fd9d5fd 100644 (file)
@@ -38,6 +38,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
 void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
 
 void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_vabt(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
@@ -147,6 +148,16 @@ static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
        return vcpu->arch.fault.esr_el2;
 }
 
+static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
+{
+       u32 esr = kvm_vcpu_get_hsr(vcpu);
+
+       if (esr & ESR_ELx_CV)
+               return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
+
+       return -1;
+}
+
 static inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault.far_el2;
index 3eda975..bd94e67 100644 (file)
@@ -290,15 +290,15 @@ struct kvm_vcpu_arch {
 #endif
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 hvc_exit_stat;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 hvc_exit_stat;
        u64 wfe_exit_stat;
        u64 wfi_exit_stat;
        u64 mmio_exit_user;
index cff5105..b18e852 100644 (file)
@@ -123,6 +123,7 @@ typeof(orig) * __hyp_text fname(void)                                       \
 
 void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
 void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
index dff1098..a79b969 100644 (file)
@@ -162,12 +162,6 @@ void kvm_clear_hyp_idmap(void);
 #define        kvm_set_pte(ptep, pte)          set_pte(ptep, pte)
 #define        kvm_set_pmd(pmdp, pmd)          set_pmd(pmdp, pmd)
 
-static inline void kvm_clean_pgd(pgd_t *pgd) {}
-static inline void kvm_clean_pmd(pmd_t *pmd) {}
-static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
-static inline void kvm_clean_pte(pte_t *pte) {}
-static inline void kvm_clean_pte_entry(pte_t *pte) {}
-
 static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
 {
        pte_val(pte) |= PTE_S2_RDWR;
index 9c9edc9..6eaf12c 100644 (file)
@@ -16,7 +16,7 @@ menuconfig VIRTUALIZATION
 
 if VIRTUALIZATION
 
-config KVM_ARM_VGIC_V3
+config KVM_ARM_VGIC_V3_ITS
        bool
 
 config KVM
@@ -34,7 +34,7 @@ config KVM
        select KVM_VFIO
        select HAVE_KVM_EVENTFD
        select HAVE_KVM_IRQFD
-       select KVM_ARM_VGIC_V3
+       select KVM_ARM_VGIC_V3_ITS
        select KVM_ARM_PMU if HW_PERF_EVENTS
        select HAVE_KVM_MSI
        select HAVE_KVM_IRQCHIP
index 695eb3c..d50a82a 100644 (file)
@@ -16,9 +16,10 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/e
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
+kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
diff --git a/arch/arm64/kvm/emulate.c b/arch/arm64/kvm/emulate.c
deleted file mode 100644 (file)
index f87d8fb..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/kvm_host.h>
-#include <asm/esr.h>
-#include <asm/kvm_emulate.h>
-
-/*
- * stolen from arch/arm/kernel/opcodes.c
- *
- * condition code lookup table
- * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
- *
- * bit position in short is condition code: NZCV
- */
-static const unsigned short cc_map[16] = {
-       0xF0F0,                 /* EQ == Z set            */
-       0x0F0F,                 /* NE                     */
-       0xCCCC,                 /* CS == C set            */
-       0x3333,                 /* CC                     */
-       0xFF00,                 /* MI == N set            */
-       0x00FF,                 /* PL                     */
-       0xAAAA,                 /* VS == V set            */
-       0x5555,                 /* VC                     */
-       0x0C0C,                 /* HI == C set && Z clear */
-       0xF3F3,                 /* LS == C clear || Z set */
-       0xAA55,                 /* GE == (N==V)           */
-       0x55AA,                 /* LT == (N!=V)           */
-       0x0A05,                 /* GT == (!Z && (N==V))   */
-       0xF5FA,                 /* LE == (Z || (N!=V))    */
-       0xFFFF,                 /* AL always              */
-       0                       /* NV                     */
-};
-
-static int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
-{
-       u32 esr = kvm_vcpu_get_hsr(vcpu);
-
-       if (esr & ESR_ELx_CV)
-               return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
-
-       return -1;
-}
-
-/*
- * Check if a trapped instruction should have been executed or not.
- */
-bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
-{
-       unsigned long cpsr;
-       u32 cpsr_cond;
-       int cond;
-
-       /* Top two bits non-zero?  Unconditional. */
-       if (kvm_vcpu_get_hsr(vcpu) >> 30)
-               return true;
-
-       /* Is condition field valid? */
-       cond = kvm_vcpu_get_condition(vcpu);
-       if (cond == 0xE)
-               return true;
-
-       cpsr = *vcpu_cpsr(vcpu);
-
-       if (cond < 0) {
-               /* This can happen in Thumb mode: examine IT state. */
-               unsigned long it;
-
-               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-               /* it == 0 => unconditional. */
-               if (it == 0)
-                       return true;
-
-               /* The cond for this insn works out as the top 4 bits. */
-               cond = (it >> 4);
-       }
-
-       cpsr_cond = cpsr >> 28;
-
-       if (!((cc_map[cond] >> cpsr_cond) & 1))
-               return false;
-
-       return true;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:      The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-       unsigned long itbits, cond;
-       unsigned long cpsr = *vcpu_cpsr(vcpu);
-       bool is_arm = !(cpsr & COMPAT_PSR_T_BIT);
-
-       BUG_ON(is_arm && (cpsr & COMPAT_PSR_IT_MASK));
-
-       if (!(cpsr & COMPAT_PSR_IT_MASK))
-               return;
-
-       cond = (cpsr & 0xe000) >> 13;
-       itbits = (cpsr & 0x1c00) >> (10 - 2);
-       itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-       /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
-       if ((itbits & 0x7) == 0)
-               itbits = cond = 0;
-       else
-               itbits = (itbits << 1) & 0x1f;
-
-       cpsr &= ~COMPAT_PSR_IT_MASK;
-       cpsr |= cond << 13;
-       cpsr |= (itbits & 0x1c) << (10 - 2);
-       cpsr |= (itbits & 0x3) << 25;
-       *vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-       bool is_thumb;
-
-       is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT);
-       if (is_thumb && !is_wide_instr)
-               *vcpu_pc(vcpu) += 2;
-       else
-               *vcpu_pc(vcpu) += 4;
-       kvm_adjust_itstate(vcpu);
-}
index fa96fe2..a204adf 100644 (file)
@@ -170,9 +170,32 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 {
        exit_handle_fn exit_handler;
 
+       if (ARM_SERROR_PENDING(exception_index)) {
+               u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu));
+
+               /*
+                * HVC/SMC already have an adjusted PC, which we need
+                * to wind back so that we return to the right
+                * address once the SError has been injected.
+                */
+               if (hsr_ec == ESR_ELx_EC_HVC32 || hsr_ec == ESR_ELx_EC_HVC64 ||
+                   hsr_ec == ESR_ELx_EC_SMC32 || hsr_ec == ESR_ELx_EC_SMC64) {
+                       u32 adj =  kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
+                       *vcpu_pc(vcpu) -= adj;
+               }
+
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
+
+       exception_index = ARM_EXCEPTION_CODE(exception_index);
+
        switch (exception_index) {
        case ARM_EXCEPTION_IRQ:
                return 1;
+       case ARM_EXCEPTION_EL1_SERROR:
+               kvm_inject_vabt(vcpu);
+               return 1;
        case ARM_EXCEPTION_TRAP:
                /*
                 * See ARM ARM B1.14.1: "Hyp traps on instructions
index 0c85feb..aaf42ae 100644 (file)
@@ -5,9 +5,9 @@
 KVM=../../../../virt/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
 
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
index 33342a7..4ba5c90 100644 (file)
@@ -131,9 +131,7 @@ void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu)
                vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
 }
 
-static u32 __hyp_text __debug_read_mdcr_el2(void)
+u32 __hyp_text __kvm_get_mdcr_el2(void)
 {
        return read_sysreg(mdcr_el2);
 }
-
-__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void);
index ce9e5e5..12ee62d 100644 (file)
  */
 ENTRY(__guest_enter)
        // x0: vcpu
-       // x1: host/guest context
-       // x2-x18: clobbered by macros
+       // x1: host context
+       // x2-x17: clobbered by macros
+       // x18: guest context
 
        // Store the host regs
        save_callee_saved_regs x1
 
-       // Preserve vcpu & host_ctxt for use at exit time
-       stp     x0, x1, [sp, #-16]!
+       // Store the host_ctxt for use at exit time
+       str     x1, [sp, #-16]!
 
-       add     x1, x0, #VCPU_CONTEXT
+       add     x18, x0, #VCPU_CONTEXT
 
-       // Prepare x0-x1 for later restore by pushing them onto the stack
-       ldp     x2, x3, [x1, #CPU_XREG_OFFSET(0)]
-       stp     x2, x3, [sp, #-16]!
+       // Restore guest regs x0-x17
+       ldp     x0, x1,   [x18, #CPU_XREG_OFFSET(0)]
+       ldp     x2, x3,   [x18, #CPU_XREG_OFFSET(2)]
+       ldp     x4, x5,   [x18, #CPU_XREG_OFFSET(4)]
+       ldp     x6, x7,   [x18, #CPU_XREG_OFFSET(6)]
+       ldp     x8, x9,   [x18, #CPU_XREG_OFFSET(8)]
+       ldp     x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+       ldp     x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+       ldp     x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+       ldp     x16, x17, [x18, #CPU_XREG_OFFSET(16)]
 
-       // x2-x18
-       ldp     x2, x3,   [x1, #CPU_XREG_OFFSET(2)]
-       ldp     x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
-       ldp     x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
-       ldp     x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
-       ldp     x10, x11, [x1, #CPU_XREG_OFFSET(10)]
-       ldp     x12, x13, [x1, #CPU_XREG_OFFSET(12)]
-       ldp     x14, x15, [x1, #CPU_XREG_OFFSET(14)]
-       ldp     x16, x17, [x1, #CPU_XREG_OFFSET(16)]
-       ldr     x18,      [x1, #CPU_XREG_OFFSET(18)]
-
-       // x19-x29, lr
-       restore_callee_saved_regs x1
-
-       // Last bits of the 64bit state
-       ldp     x0, x1, [sp], #16
+       // Restore guest regs x19-x29, lr
+       restore_callee_saved_regs x18
+
+       // Restore guest reg x18
+       ldr     x18,      [x18, #CPU_XREG_OFFSET(18)]
 
        // Do not touch any register after this!
        eret
 ENDPROC(__guest_enter)
 
 ENTRY(__guest_exit)
-       // x0: vcpu
-       // x1: return code
-       // x2-x3: free
-       // x4-x29,lr: vcpu regs
-       // vcpu x0-x3 on the stack
+       // x0: return code
+       // x1: vcpu
+       // x2-x29,lr: vcpu regs
+       // vcpu x0-x1 on the stack
 
-       add     x2, x0, #VCPU_CONTEXT
+       add     x1, x1, #VCPU_CONTEXT
 
-       stp     x4, x5,   [x2, #CPU_XREG_OFFSET(4)]
-       stp     x6, x7,   [x2, #CPU_XREG_OFFSET(6)]
-       stp     x8, x9,   [x2, #CPU_XREG_OFFSET(8)]
-       stp     x10, x11, [x2, #CPU_XREG_OFFSET(10)]
-       stp     x12, x13, [x2, #CPU_XREG_OFFSET(12)]
-       stp     x14, x15, [x2, #CPU_XREG_OFFSET(14)]
-       stp     x16, x17, [x2, #CPU_XREG_OFFSET(16)]
-       str     x18,      [x2, #CPU_XREG_OFFSET(18)]
+       ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
 
-       ldp     x6, x7, [sp], #16       // x2, x3
-       ldp     x4, x5, [sp], #16       // x0, x1
+       // Store the guest regs x2 and x3
+       stp     x2, x3,   [x1, #CPU_XREG_OFFSET(2)]
 
-       stp     x4, x5, [x2, #CPU_XREG_OFFSET(0)]
-       stp     x6, x7, [x2, #CPU_XREG_OFFSET(2)]
+       // Retrieve the guest regs x0-x1 from the stack
+       ldp     x2, x3, [sp], #16       // x0, x1
+
+       // Store the guest regs x0-x1 and x4-x18
+       stp     x2, x3,   [x1, #CPU_XREG_OFFSET(0)]
+       stp     x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
+       stp     x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
+       stp     x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
+       stp     x10, x11, [x1, #CPU_XREG_OFFSET(10)]
+       stp     x12, x13, [x1, #CPU_XREG_OFFSET(12)]
+       stp     x14, x15, [x1, #CPU_XREG_OFFSET(14)]
+       stp     x16, x17, [x1, #CPU_XREG_OFFSET(16)]
+       str     x18,      [x1, #CPU_XREG_OFFSET(18)]
+
+       // Store the guest regs x19-x29, lr
+       save_callee_saved_regs x1
 
-       save_callee_saved_regs x2
+       // Restore the host_ctxt from the stack
+       ldr     x2, [sp], #16
 
-       // Restore vcpu & host_ctxt from the stack
-       // (preserving return code in x1)
-       ldp     x0, x2, [sp], #16
        // Now restore the host regs
        restore_callee_saved_regs x2
 
-       mov     x0, x1
-       ret
+       // If we have a pending asynchronous abort, now is the
+       // time to find out. From your VAXorcist book, page 666:
+       // "Threaten me not, oh Evil one!  For I speak with
+       // the power of DEC, and I command thee to show thyself!"
+       mrs     x2, elr_el2
+       mrs     x3, esr_el2
+       mrs     x4, spsr_el2
+       mov     x5, x0
+
+       dsb     sy              // Synchronize against in-flight ld/st
+       msr     daifclr, #4     // Unmask aborts
+
+       // This is our single-instruction exception window. A pending
+       // SError is guaranteed to occur at the earliest when we unmask
+       // it, and at the latest just after the ISB.
+       .global abort_guest_exit_start
+abort_guest_exit_start:
+
+       isb
+
+       .global abort_guest_exit_end
+abort_guest_exit_end:
+
+       // If the exception took place, restore the EL1 exception
+       // context so that we can report some information.
+       // Merge the exception code with the SError pending bit.
+       tbz     x0, #ARM_EXIT_WITH_SERROR_BIT, 1f
+       msr     elr_el2, x2
+       msr     esr_el2, x3
+       msr     spsr_el2, x4
+       orr     x0, x0, x5
+1:     ret
 ENDPROC(__guest_exit)
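In C terms, the abort-flush sequence above behaves roughly as the model below. This is purely illustrative: the real logic must remain the exact instruction sequence delimited by the two exported labels, and if the SError fires, el2_error rewrites the return value to just the flag bit before execution continues.

	/* save the EL1 exception context an SError would clobber */
	u64 elr = read_sysreg(elr_el2);
	u64 esr = read_sysreg(esr_el2);
	u64 spsr = read_sysreg(spsr_el2);
	int saved_code = exit_code;

	dsb(sy);				/* settle in-flight accesses */
	asm volatile("msr daifclr, #4; isb");	/* pending SError fires here */

	if (exit_code & (1U << ARM_EXIT_WITH_SERROR_BIT)) {
		/* el2_error ran: restore the context, merge the codes */
		write_sysreg(elr, elr_el2);
		write_sysreg(esr, esr_el2);
		write_sysreg(spsr, spsr_el2);
		exit_code |= saved_code;
	}
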
 
 ENTRY(__fpsimd_guest_restore)
+       stp     x2, x3, [sp, #-16]!
        stp     x4, lr, [sp, #-16]!
 
 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
index f6d9694..4e92399 100644 (file)
        .text
        .pushsection    .hyp.text, "ax"
 
-.macro save_x0_to_x3
-       stp     x0, x1, [sp, #-16]!
-       stp     x2, x3, [sp, #-16]!
-.endm
-
-.macro restore_x0_to_x3
-       ldp     x2, x3, [sp], #16
-       ldp     x0, x1, [sp], #16
-.endm
-
 .macro do_el2_call
        /*
         * Shuffle the parameters before calling the function
@@ -79,23 +69,23 @@ ENTRY(__kvm_hyp_teardown)
 ENDPROC(__kvm_hyp_teardown)
        
 el1_sync:                              // Guest trapped into EL2
-       save_x0_to_x3
+       stp     x0, x1, [sp, #-16]!
 
 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
        mrs     x1, esr_el2
 alternative_else
        mrs     x1, esr_el1
 alternative_endif
-       lsr     x2, x1, #ESR_ELx_EC_SHIFT
+       lsr     x0, x1, #ESR_ELx_EC_SHIFT
 
-       cmp     x2, #ESR_ELx_EC_HVC64
+       cmp     x0, #ESR_ELx_EC_HVC64
        b.ne    el1_trap
 
-       mrs     x3, vttbr_el2           // If vttbr is valid, the 64bit guest
-       cbnz    x3, el1_trap            // called HVC
+       mrs     x1, vttbr_el2           // If vttbr is valid, the 64bit guest
+       cbnz    x1, el1_trap            // called HVC
 
        /* Here, we're pretty sure the host called HVC. */
-       restore_x0_to_x3
+       ldp     x0, x1, [sp], #16
 
        cmp     x0, #HVC_GET_VECTORS
        b.ne    1f
@@ -113,24 +103,51 @@ alternative_endif
 
 el1_trap:
        /*
-        * x1: ESR
-        * x2: ESR_EC
+        * x0: ESR_EC
         */
 
        /* Guest accessed VFP/SIMD registers, save host, restore Guest */
-       cmp     x2, #ESR_ELx_EC_FP_ASIMD
+       cmp     x0, #ESR_ELx_EC_FP_ASIMD
        b.eq    __fpsimd_guest_restore
 
-       mrs     x0, tpidr_el2
-       mov     x1, #ARM_EXCEPTION_TRAP
+       mrs     x1, tpidr_el2
+       mov     x0, #ARM_EXCEPTION_TRAP
        b       __guest_exit
 
 el1_irq:
-       save_x0_to_x3
-       mrs     x0, tpidr_el2
-       mov     x1, #ARM_EXCEPTION_IRQ
+       stp     x0, x1, [sp, #-16]!
+       mrs     x1, tpidr_el2
+       mov     x0, #ARM_EXCEPTION_IRQ
+       b       __guest_exit
+
+el1_error:
+       stp     x0, x1, [sp, #-16]!
+       mrs     x1, tpidr_el2
+       mov     x0, #ARM_EXCEPTION_EL1_SERROR
        b       __guest_exit
 
+el2_error:
+       /*
+        * Only two possibilities:
+        * 1) Either we come from the exit path, having just unmasked
+        *    PSTATE.A: change the return code to an EL2 fault, and
+        *    carry on, as we're already in a sane state to handle it.
+        * 2) Or we come from anywhere else, and that's a bug: we panic.
+        *
+        * For (1), x0 contains the original return code and x1 doesn't
+        * contain anything meaningful at that stage. We can reuse them
+        * as temp registers.
+        * For (2), who cares?
+        */
+       mrs     x0, elr_el2
+       adr     x1, abort_guest_exit_start
+       cmp     x0, x1
+       adr     x1, abort_guest_exit_end
+       ccmp    x0, x1, #4, ne
+       b.ne    __hyp_panic
+       mov     x0, #(1 << ARM_EXIT_WITH_SERROR_BIT)
+       eret
+
 ENTRY(__hyp_do_panic)
        mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
                      PSR_MODE_EL1h)
@@ -155,11 +172,9 @@ ENDPROC(\label)
        invalid_vector  el2h_sync_invalid
        invalid_vector  el2h_irq_invalid
        invalid_vector  el2h_fiq_invalid
-       invalid_vector  el2h_error_invalid
        invalid_vector  el1_sync_invalid
        invalid_vector  el1_irq_invalid
        invalid_vector  el1_fiq_invalid
-       invalid_vector  el1_error_invalid
 
        .ltorg
 
@@ -174,15 +189,15 @@ ENTRY(__kvm_hyp_vector)
        ventry  el2h_sync_invalid               // Synchronous EL2h
        ventry  el2h_irq_invalid                // IRQ EL2h
        ventry  el2h_fiq_invalid                // FIQ EL2h
-       ventry  el2h_error_invalid              // Error EL2h
+       ventry  el2_error                       // Error EL2h
 
        ventry  el1_sync                        // Synchronous 64-bit EL1
        ventry  el1_irq                         // IRQ 64-bit EL1
        ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
-       ventry  el1_error_invalid               // Error 64-bit EL1
+       ventry  el1_error                       // Error 64-bit EL1
 
        ventry  el1_sync                        // Synchronous 32-bit EL1
        ventry  el1_irq                         // IRQ 32-bit EL1
        ventry  el1_fiq_invalid                 // FIQ 32-bit EL1
-       ventry  el1_error_invalid               // Error 32-bit EL1
+       ventry  el1_error                       // Error 32-bit EL1
 ENDPROC(__kvm_hyp_vector)
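The cmp/ccmp pair in el2_error is a compact two-point address test. Expressed in C it comes to roughly the following (illustrative only; hyp_panic() stands in for the branch to __hyp_panic):

	extern char abort_guest_exit_start[], abort_guest_exit_end[];
	u64 elr = read_sysreg(elr_el2);

	/* an SError is only legitimate if taken at the isb inside the
	 * window: ELR then points at abort_guest_exit_start (the isb)
	 * or abort_guest_exit_end (just past it) */
	if (elr != (u64)abort_guest_exit_start &&
	    elr != (u64)abort_guest_exit_end)
		hyp_panic();

	return 1U << ARM_EXIT_WITH_SERROR_BIT;	/* then eret */
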
index 5a84b45..83037cd 100644 (file)
  */
 
 #include <linux/types.h>
+#include <linux/jump_label.h>
+
 #include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 static bool __hyp_text __fpsimd_enabled_nvhe(void)
@@ -109,6 +112,15 @@ static hyp_alternate_select(__deactivate_traps_arch,
 
 static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
 {
+       /*
+        * If we pended a virtual abort, preserve it until it gets
+        * cleared. See D1.14.3 (Virtual Interrupts) for details, but
+        * the crucial bit is "On taking a vSError interrupt,
+        * HCR_EL2.VSE is cleared to 0."
+        */
+       if (vcpu->arch.hcr_el2 & HCR_VSE)
+               vcpu->arch.hcr_el2 = read_sysreg(hcr_el2);
+
        __deactivate_traps_arch()();
        write_sysreg(0, hstr_el2);
        write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
@@ -126,17 +138,13 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
        write_sysreg(0, vttbr_el2);
 }
 
-static hyp_alternate_select(__vgic_call_save_state,
-                           __vgic_v2_save_state, __vgic_v3_save_state,
-                           ARM64_HAS_SYSREG_GIC_CPUIF);
-
-static hyp_alternate_select(__vgic_call_restore_state,
-                           __vgic_v2_restore_state, __vgic_v3_restore_state,
-                           ARM64_HAS_SYSREG_GIC_CPUIF);
-
 static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
 {
-       __vgic_call_save_state()(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_save_state(vcpu);
+       else
+               __vgic_v2_save_state(vcpu);
+
        write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
 }
 
@@ -149,7 +157,10 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
        val |= vcpu->arch.irq_lines;
        write_sysreg(val, hcr_el2);
 
-       __vgic_call_restore_state()(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_restore_state(vcpu);
+       else
+               __vgic_v2_restore_state(vcpu);
 }
 
 static bool __hyp_text __true_value(void)
@@ -232,7 +243,22 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
        return true;
 }
 
-static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
+{
+       *vcpu_pc(vcpu) = read_sysreg_el2(elr);
+
+       if (vcpu_mode_is_32bit(vcpu)) {
+               vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr);
+               kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+               write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr);
+       } else {
+               *vcpu_pc(vcpu) += 4;
+       }
+
+       write_sysreg_el2(*vcpu_pc(vcpu), elr);
+}
+
+int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
@@ -267,9 +293,43 @@ again:
        exit_code = __guest_enter(vcpu, host_ctxt);
        /* And we're baaack! */
 
+       /*
+        * We're using the raw exception code in order to only process
+        * the trap if no SError is pending. We will come back to the
+        * same PC once the SError has been injected, and replay the
+        * trapping instruction.
+        */
        if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
                goto again;
 
+       if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
+           exit_code == ARM_EXCEPTION_TRAP) {
+               bool valid;
+
+               valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW &&
+                       kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
+                       kvm_vcpu_dabt_isvalid(vcpu) &&
+                       !kvm_vcpu_dabt_isextabt(vcpu) &&
+                       !kvm_vcpu_dabt_iss1tw(vcpu);
+
+               if (valid) {
+                       int ret = __vgic_v2_perform_cpuif_access(vcpu);
+
+                       if (ret == 1) {
+                               __skip_instr(vcpu);
+                               goto again;
+                       }
+
+                       if (ret == -1) {
+                               /* Promote an illegal access to an SError */
+                               __skip_instr(vcpu);
+                               exit_code = ARM_EXCEPTION_EL1_SERROR;
+                       }
+
+                       /* 0 falls through to be handled out of EL2 */
+               }
+       }
+
        fp_enabled = __fpsimd_enabled();
 
        __sysreg_save_guest_state(guest_ctxt);
@@ -293,8 +353,6 @@ again:
        return exit_code;
 }
 
-__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-
 static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
 
 static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
index be8177c..9cc0ea7 100644 (file)
@@ -17,7 +17,7 @@
 
 #include <asm/kvm_hyp.h>
 
-static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
        dsb(ishst);
 
@@ -48,10 +48,7 @@ static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
        write_sysreg(0, vttbr_el2);
 }
 
-__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
-                                                           phys_addr_t ipa);
-
-static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
 {
        dsb(ishst);
 
@@ -67,14 +64,10 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
        write_sysreg(0, vttbr_el2);
 }
 
-__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
-static void __hyp_text __tlb_flush_vm_context(void)
+void __hyp_text __kvm_flush_vm_context(void)
 {
        dsb(ishst);
        asm volatile("tlbi alle1is      \n"
                     "ic ialluis          ": : );
        dsb(ish);
 }
-
-__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
deleted file mode 100644 (file)
index 5f8f80b..0000000
+++ /dev/null
@@ -1,343 +0,0 @@
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_hyp.h>
-
-#define vtr_to_max_lr_idx(v)           ((v) & 0xf)
-#define vtr_to_nr_pri_bits(v)          (((u32)(v) >> 29) + 1)
-
-#define read_gicreg(r)                                                 \
-       ({                                                              \
-               u64 reg;                                                \
-               asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
-               reg;                                                    \
-       })
-
-#define write_gicreg(v,r)                                              \
-       do {                                                            \
-               u64 __val = (v);                                        \
-               asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
-       } while (0)
-
-static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
-{
-       switch (lr & 0xf) {
-       case 0:
-               return read_gicreg(ICH_LR0_EL2);
-       case 1:
-               return read_gicreg(ICH_LR1_EL2);
-       case 2:
-               return read_gicreg(ICH_LR2_EL2);
-       case 3:
-               return read_gicreg(ICH_LR3_EL2);
-       case 4:
-               return read_gicreg(ICH_LR4_EL2);
-       case 5:
-               return read_gicreg(ICH_LR5_EL2);
-       case 6:
-               return read_gicreg(ICH_LR6_EL2);
-       case 7:
-               return read_gicreg(ICH_LR7_EL2);
-       case 8:
-               return read_gicreg(ICH_LR8_EL2);
-       case 9:
-               return read_gicreg(ICH_LR9_EL2);
-       case 10:
-               return read_gicreg(ICH_LR10_EL2);
-       case 11:
-               return read_gicreg(ICH_LR11_EL2);
-       case 12:
-               return read_gicreg(ICH_LR12_EL2);
-       case 13:
-               return read_gicreg(ICH_LR13_EL2);
-       case 14:
-               return read_gicreg(ICH_LR14_EL2);
-       case 15:
-               return read_gicreg(ICH_LR15_EL2);
-       }
-
-       unreachable();
-}
-
-static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
-{
-       switch (lr & 0xf) {
-       case 0:
-               write_gicreg(val, ICH_LR0_EL2);
-               break;
-       case 1:
-               write_gicreg(val, ICH_LR1_EL2);
-               break;
-       case 2:
-               write_gicreg(val, ICH_LR2_EL2);
-               break;
-       case 3:
-               write_gicreg(val, ICH_LR3_EL2);
-               break;
-       case 4:
-               write_gicreg(val, ICH_LR4_EL2);
-               break;
-       case 5:
-               write_gicreg(val, ICH_LR5_EL2);
-               break;
-       case 6:
-               write_gicreg(val, ICH_LR6_EL2);
-               break;
-       case 7:
-               write_gicreg(val, ICH_LR7_EL2);
-               break;
-       case 8:
-               write_gicreg(val, ICH_LR8_EL2);
-               break;
-       case 9:
-               write_gicreg(val, ICH_LR9_EL2);
-               break;
-       case 10:
-               write_gicreg(val, ICH_LR10_EL2);
-               break;
-       case 11:
-               write_gicreg(val, ICH_LR11_EL2);
-               break;
-       case 12:
-               write_gicreg(val, ICH_LR12_EL2);
-               break;
-       case 13:
-               write_gicreg(val, ICH_LR13_EL2);
-               break;
-       case 14:
-               write_gicreg(val, ICH_LR14_EL2);
-               break;
-       case 15:
-               write_gicreg(val, ICH_LR15_EL2);
-               break;
-       }
-}
-
-static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       int i;
-       bool expect_mi;
-
-       expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE);
-
-       for (i = 0; i < nr_lr; i++) {
-               if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
-                               continue;
-
-               expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) &&
-                             (cpu_if->vgic_lr[i] & ICH_LR_EOI));
-       }
-
-       if (expect_mi) {
-               cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
-
-               if (cpu_if->vgic_misr & ICH_MISR_EOI)
-                       cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2);
-               else
-                       cpu_if->vgic_eisr = 0;
-       } else {
-               cpu_if->vgic_misr = 0;
-               cpu_if->vgic_eisr = 0;
-       }
-}
-
-void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 val;
-
-       /*
-        * Make sure stores to the GIC via the memory mapped interface
-        * are now visible to the system register interface.
-        */
-       if (!cpu_if->vgic_sre)
-               dsb(st);
-
-       cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
-
-       if (vcpu->arch.vgic_cpu.live_lrs) {
-               int i;
-               u32 max_lr_idx, nr_pri_bits;
-
-               cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
-
-               write_gicreg(0, ICH_HCR_EL2);
-               val = read_gicreg(ICH_VTR_EL2);
-               max_lr_idx = vtr_to_max_lr_idx(val);
-               nr_pri_bits = vtr_to_nr_pri_bits(val);
-
-               save_maint_int_state(vcpu, max_lr_idx + 1);
-
-               for (i = 0; i <= max_lr_idx; i++) {
-                       if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
-                               continue;
-
-                       if (cpu_if->vgic_elrsr & (1 << i))
-                               cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
-                       else
-                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
-
-                       __gic_v3_set_lr(0, i);
-               }
-
-               switch (nr_pri_bits) {
-               case 7:
-                       cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
-                       cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
-               case 6:
-                       cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
-               default:
-                       cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
-               }
-
-               switch (nr_pri_bits) {
-               case 7:
-                       cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
-                       cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
-               case 6:
-                       cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
-               default:
-                       cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
-               }
-
-               vcpu->arch.vgic_cpu.live_lrs = 0;
-       } else {
-               cpu_if->vgic_misr  = 0;
-               cpu_if->vgic_eisr  = 0;
-               cpu_if->vgic_elrsr = 0xffff;
-               cpu_if->vgic_ap0r[0] = 0;
-               cpu_if->vgic_ap0r[1] = 0;
-               cpu_if->vgic_ap0r[2] = 0;
-               cpu_if->vgic_ap0r[3] = 0;
-               cpu_if->vgic_ap1r[0] = 0;
-               cpu_if->vgic_ap1r[1] = 0;
-               cpu_if->vgic_ap1r[2] = 0;
-               cpu_if->vgic_ap1r[3] = 0;
-       }
-
-       val = read_gicreg(ICC_SRE_EL2);
-       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
-
-       if (!cpu_if->vgic_sre) {
-               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
-               isb();
-               write_gicreg(1, ICC_SRE_EL1);
-       }
-}
-
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 val;
-       u32 max_lr_idx, nr_pri_bits;
-       u16 live_lrs = 0;
-       int i;
-
-       /*
-        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
-        * Group0 interrupt (as generated in GICv2 mode) to be
-        * delivered as a FIQ to the guest, with potentially fatal
-        * consequences. So we must make sure that ICC_SRE_EL1 has
-        * been actually programmed with the value we want before
-        * starting to mess with the rest of the GIC.
-        */
-       if (!cpu_if->vgic_sre) {
-               write_gicreg(0, ICC_SRE_EL1);
-               isb();
-       }
-
-       val = read_gicreg(ICH_VTR_EL2);
-       max_lr_idx = vtr_to_max_lr_idx(val);
-       nr_pri_bits = vtr_to_nr_pri_bits(val);
-
-       for (i = 0; i <= max_lr_idx; i++) {
-               if (cpu_if->vgic_lr[i] & ICH_LR_STATE)
-                       live_lrs |= (1 << i);
-       }
-
-       write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
-       if (live_lrs) {
-               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-
-               switch (nr_pri_bits) {
-               case 7:
-                       write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
-                       write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
-               case 6:
-                       write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
-               default:
-                       write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
-               }
-
-               switch (nr_pri_bits) {
-               case 7:
-                       write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
-                       write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
-               case 6:
-                       write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
-               default:
-                       write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
-               }
-
-               for (i = 0; i <= max_lr_idx; i++) {
-                       if (!(live_lrs & (1 << i)))
-                               continue;
-
-                       __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
-               }
-       }
-
-       /*
-        * Ensures that the above will have reached the
-        * (re)distributors. This ensure the guest will read the
-        * correct values from the memory-mapped interface.
-        */
-       if (!cpu_if->vgic_sre) {
-               isb();
-               dsb(sy);
-       }
-       vcpu->arch.vgic_cpu.live_lrs = live_lrs;
-
-       /*
-        * Prevent the guest from touching the GIC system registers if
-        * SRE isn't enabled for GICv3 emulation.
-        */
-       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-                    ICC_SRE_EL2);
-}
-
-void __hyp_text __vgic_v3_init_lrs(void)
-{
-       int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
-       int i;
-
-       for (i = 0; i <= max_lr_idx; i++)
-               __gic_v3_set_lr(0, i);
-}
-
-static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
-{
-       return read_gicreg(ICH_VTR_EL2);
-}
-
-__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void);
index 898c0e6..da6a8cf 100644 (file)
@@ -231,3 +231,15 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
        else
                inject_undef64(vcpu);
 }
+
+/**
+ * kvm_inject_vabt - inject an async abort / SError into the guest
+ * @vcpu: The VCPU to receive the exception
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+{
+       vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE);
+}
index b54bcad..07f58cf 100644 (file)
 #define KVM_INVALID_INST               0xdeadbeef
 #define KVM_INVALID_ADDR               0xdeadbeef
 
+/*
+ * EVA has overlapping user & kernel address spaces, so user VAs may be >
+ * PAGE_OFFSET. For this reason we can't use the default KVM_HVA_ERR_BAD of
+ * PAGE_OFFSET.
+ */
+
+#define KVM_HVA_ERR_BAD                        (-1UL)
+#define KVM_HVA_ERR_RO_BAD             (-2UL)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+       return IS_ERR_VALUE(addr);
+}
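For illustration, both error values sit in the last page of the address space, which IS_ERR_VALUE() classifies as errors while leaving high EVA user addresses usable (the third address below is made up):

	kvm_is_error_hva(KVM_HVA_ERR_BAD);	/* true:  -1UL */
	kvm_is_error_hva(KVM_HVA_ERR_RO_BAD);	/* true:  -2UL */
	kvm_is_error_hva(0x90000000ul);		/* false: EVA user VA above PAGE_OFFSET */
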
+
 extern atomic_t kvm_mips_instance;
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 wait_exits;
-       u32 cache_exits;
-       u32 signal_exits;
-       u32 int_exits;
-       u32 cop_unusable_exits;
-       u32 tlbmod_exits;
-       u32 tlbmiss_ld_exits;
-       u32 tlbmiss_st_exits;
-       u32 addrerr_st_exits;
-       u32 addrerr_ld_exits;
-       u32 syscall_exits;
-       u32 resvd_inst_exits;
-       u32 break_inst_exits;
-       u32 trap_inst_exits;
-       u32 msa_fpe_exits;
-       u32 fpe_exits;
-       u32 msa_disabled_exits;
-       u32 flush_dcache_exits;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
+       u64 wait_exits;
+       u64 cache_exits;
+       u64 signal_exits;
+       u64 int_exits;
+       u64 cop_unusable_exits;
+       u64 tlbmod_exits;
+       u64 tlbmiss_ld_exits;
+       u64 tlbmiss_st_exits;
+       u64 addrerr_st_exits;
+       u64 addrerr_ld_exits;
+       u64 syscall_exits;
+       u64 resvd_inst_exits;
+       u64 break_inst_exits;
+       u64 trap_inst_exits;
+       u64 msa_fpe_exits;
+       u64 fpe_exits;
+       u64 msa_disabled_exits;
+       u64 flush_dcache_exits;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
 };
 
 struct kvm_arch_memory_slot {
@@ -314,6 +328,9 @@ struct kvm_vcpu_arch {
        u32 guest_kernel_asid[NR_CPUS];
        struct mm_struct guest_kernel_mm, guest_user_mm;
 
+       /* Guest ASID of last user mode execution */
+       unsigned int last_user_gasid;
+
        int last_sched_cpu;
 
        /* WAIT executed */
index e788515..4db4c03 100644 (file)
@@ -846,6 +846,47 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
        return EMULATE_FAIL;
 }
 
+/**
+ * kvm_mips_invalidate_guest_tlb() - Indicates a change in guest MMU map.
+ * @vcpu:      VCPU with changed mappings.
+ * @tlb:       TLB entry being removed.
+ *
+ * This is called to indicate a single change in guest MMU mappings, so that we
+ * can arrange TLB flushes on this and other CPUs.
+ */
+static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
+                                         struct kvm_mips_tlb *tlb)
+{
+       int cpu, i;
+       bool user;
+
+       /* No need to flush for entries which are already invalid */
+       if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V))
+               return;
+       /* User address space doesn't need flushing for KSeg2/3 changes */
+       user = tlb->tlb_hi < KVM_GUEST_KSEG0;
+
+       preempt_disable();
+
+       /*
+        * Probe the shadow host TLB for the entry being overwritten; if one
+        * matches, invalidate it.
+        */
+       kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+
+       /* Invalidate the whole ASID on other CPUs */
+       cpu = smp_processor_id();
+       for_each_possible_cpu(i) {
+               if (i == cpu)
+                       continue;
+               if (user)
+                       vcpu->arch.guest_user_asid[i] = 0;
+               vcpu->arch.guest_kernel_asid[i] = 0;
+       }
+
+       preempt_enable();
+}
+
 /* Write Guest TLB Entry @ Index */
 enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 {
@@ -865,11 +906,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
        }
 
        tlb = &vcpu->arch.guest_tlb[index];
-       /*
-        * Probe the shadow host TLB for the entry being overwritten, if one
-        * matches, invalidate it
-        */
-       kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+
+       kvm_mips_invalidate_guest_tlb(vcpu, tlb);
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
@@ -898,11 +936,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 
        tlb = &vcpu->arch.guest_tlb[index];
 
-       /*
-        * Probe the shadow host TLB for the entry being overwritten, if one
-        * matches, invalidate it
-        */
-       kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+       kvm_mips_invalidate_guest_tlb(vcpu, tlb);
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
@@ -1026,6 +1060,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
        enum emulation_result er = EMULATE_DONE;
        u32 rt, rd, sel;
        unsigned long curr_pc;
+       int cpu, i;
 
        /*
         * Update PC and hold onto current PC in case there is
@@ -1127,16 +1162,31 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
                        } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
                                u32 nasid =
                                        vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
-                               if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
-                                   ((kvm_read_c0_guest_entryhi(cop0) &
+                               if (((kvm_read_c0_guest_entryhi(cop0) &
                                      KVM_ENTRYHI_ASID) != nasid)) {
                                        trace_kvm_asid_change(vcpu,
                                                kvm_read_c0_guest_entryhi(cop0)
                                                        & KVM_ENTRYHI_ASID,
                                                nasid);
 
-                                       /* Blow away the shadow host TLBs */
-                                       kvm_mips_flush_host_tlb(1);
+                                       /*
+                                        * Regenerate/invalidate kernel MMU
+                                        * context.
+                                        * The user MMU context will be
+                                        * regenerated lazily on re-entry to
+                                        * guest user if the guest ASID actually
+                                        * changes.
+                                        */
+                                       preempt_disable();
+                                       cpu = smp_processor_id();
+                                       kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm,
+                                                               cpu, vcpu);
+                                       vcpu->arch.guest_kernel_asid[cpu] =
+                                               vcpu->arch.guest_kernel_mm.context.asid[cpu];
+                                       for_each_possible_cpu(i)
+                                               if (i != cpu)
+                                                       vcpu->arch.guest_kernel_asid[i] = 0;
+                                       preempt_enable();
                                }
                                kvm_write_c0_guest_entryhi(cop0,
                                                           vcpu->arch.gprs[rt]);
index a6ea084..ce96149 100644 (file)
@@ -140,6 +140,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        return 0;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 void kvm_mips_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
@@ -411,6 +421,31 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        return -ENOIOCTLCMD;
 }
 
+/* Must be called with preemption disabled, just before entering guest */
+static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       int cpu = smp_processor_id();
+       unsigned int gasid;
+
+       /*
+        * Lazy host ASID regeneration for guest user mode.
+        * If the guest ASID has changed since the last guest usermode
+        * execution, regenerate the host ASID so as to invalidate stale TLB
+        * entries.
+        */
+       if (!KVM_GUEST_KERNEL_MODE(vcpu)) {
+               gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
+               if (gasid != vcpu->arch.last_user_gasid) {
+                       kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu,
+                                               vcpu);
+                       vcpu->arch.guest_user_asid[cpu] =
+                               vcpu->arch.guest_user_mm.context.asid[cpu];
+                       vcpu->arch.last_user_gasid = gasid;
+               }
+       }
+}
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        int r = 0;
@@ -438,6 +473,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        htw_stop();
 
        trace_kvm_enter(vcpu);
+
+       kvm_mips_check_asids(vcpu);
+
        r = vcpu->arch.vcpu_run(run, vcpu);
        trace_kvm_out(vcpu);
 
@@ -1551,6 +1589,8 @@ skip_emul:
        if (ret == RESUME_GUEST) {
                trace_kvm_reenter(vcpu);
 
+               kvm_mips_check_asids(vcpu);
+
                /*
                 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
                 * is live), restore FCR31 / MSACSR.
index 121008c..03883ba 100644 (file)
@@ -250,15 +250,27 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
                vcpu->arch.guest_kernel_asid[cpu] =
                    vcpu->arch.guest_kernel_mm.context.asid[cpu];
+               newasid++;
+
+               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+                         cpu_context(cpu, current->mm));
+               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
+       }
+
+       if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
+                                               asid_version_mask(cpu)) {
+               u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
+                               KVM_ENTRYHI_ASID;
+
                kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
                vcpu->arch.guest_user_asid[cpu] =
                    vcpu->arch.guest_user_mm.context.asid[cpu];
+               vcpu->arch.last_user_gasid = gasid;
                newasid++;
 
                kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
                          cpu_context(cpu, current->mm));
-               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
                kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
                          vcpu->arch.guest_user_asid[cpu]);
        }
index 0915539..3a5484f 100644 (file)
@@ -175,6 +175,24 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
                        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                        ret = RESUME_HOST;
                }
+       } else if (KVM_GUEST_KERNEL_MODE(vcpu)
+                  && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
+               /*
+                * With EVA we may get a TLB exception instead of an address
+                * error when the guest performs MMIO to KSeg1 addresses.
+                */
+               kvm_debug("Emulate %s MMIO space\n",
+                         store ? "Store to" : "Load from");
+               er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
+               if (er == EMULATE_FAIL) {
+                       kvm_err("Emulate %s MMIO space failed\n",
+                               store ? "Store to" : "Load from");
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       ret = RESUME_HOST;
+               } else {
+                       run->exit_reason = KVM_EXIT_MMIO;
+                       ret = RESUME_HOST;
+               }
        } else {
                kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
                        store ? "ST" : "LD", cause, opc, badvaddr);
index 287a656..e407af2 100644 (file)
@@ -244,6 +244,43 @@ static inline int segment_shift(int ssize)
        return SID_SHIFT_1T;
 }
 
+/*
+ * This array is indexed by the LP field of the HPTE second dword.
+ * Since this field may contain some RPN bits, some entries are
+ * replicated so that we get the same value irrespective of RPN.
+ * The top 4 bits are the page size index (MMU_PAGE_*) for the
+ * actual page size, the bottom 4 bits are the base page size.
+ */
+extern u8 hpte_page_sizes[1 << LP_BITS];
+
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+                                            bool is_base_size)
+{
+       unsigned int i, lp;
+
+       if (!(h & HPTE_V_LARGE))
+               return 1ul << 12;
+
+       /* Look at the 8 bit LP value */
+       lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+       i = hpte_page_sizes[lp];
+       if (!i)
+               return 0;
+       if (!is_base_size)
+               i >>= 4;
+       return 1ul << mmu_psize_defs[i & 0xf].shift;
+}
+
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+       return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+       return __hpte_page_size(h, l, 1);
+}
+
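The packing convention described in the comment can be sketched as follows; the 64K-base/16M-actual MPSS pairing is only an example, and the table itself is populated elsewhere:

	/* e.g. encoding a 64K base page with a 16M actual page: */
	hpte_page_sizes[lp] = (MMU_PAGE_16M << 4) | MMU_PAGE_64K;

	/* ...which the helpers above unpack as: */
	unsigned int i = hpte_page_sizes[lp];
	unsigned int base_psize   = i & 0xf;	/* bottom nibble */
	unsigned int actual_psize = i >> 4;	/* top nibble */
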
 /*
  * The current system page and segment sizes
  */
index 2fd1690..f6fda84 100644 (file)
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
 #endif
 #endif /* __powerpc64__ */
 
+
+/*
+ * Simple cache-inhibited accessors.
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers; callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+       u32 ret;
+
+       __asm__ __volatile__("lwzcix %0,0, %1"
+                            : "=r" (ret) : "r" (addr) : "memory");
+       return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+       __asm__ __volatile__("stbcix %0,0,%1"
+               : : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+       __asm__ __volatile__("stwcix %0,0,%1"
+               : : "r" (val), "r" (addr) : "memory");
+}
+
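Since the accessors deliberately omit barriers, a real-mode caller is expected to bracket them itself. A hedged sketch (mmio_real_addr is a hypothetical address, and the exact barriers depend on the caller's ordering needs):

	u32 val;

	__asm__ __volatile__("sync" : : : "memory");	/* order prior stores */
	val = _lwzcix(mmio_real_addr);
	__asm__ __volatile__("isync" : : : "memory");	/* no execution past the load */
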
 /*
  * Low level IO stream instructions are defined out of line for now
  */
index 5bca220..05cabed 100644 (file)
 #define BOOK3S_INTERRUPT_FAC_UNAVAIL   0xf60
 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80
 
+/* book3s_hv */
+
+/*
+ * Special trap used to indicate to the host that this is a
+ * passthrough interrupt that could not be handled
+ * completely in the guest.
+ */
+#define BOOK3S_INTERRUPT_HV_RM_HARD    0x5555
+
 #define BOOK3S_IRQPRIO_SYSTEM_RESET            0
 #define BOOK3S_IRQPRIO_DATA_SEGMENT            1
 #define BOOK3S_IRQPRIO_INST_SEGMENT            2
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
 #define RESUME_FLAG_ARCH1      (1<<2)
+#define RESUME_FLAG_ARCH2      (1<<3)
 
 #define RESUME_GUEST            0
 #define RESUME_GUEST_NV         RESUME_FLAG_NV
index 8f39796..5cf306a 100644 (file)
@@ -69,6 +69,43 @@ struct hpte_cache {
        int pagesize;
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
+ */
+struct kvmppc_vcore {
+       int n_runnable;
+       int num_threads;
+       int entry_exit_map;
+       int napping_threads;
+       int first_vcpuid;
+       u16 pcpu;
+       u16 last_cpu;
+       u8 vcore_state;
+       u8 in_guest;
+       struct kvmppc_vcore *master_vcore;
+       struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
+       struct list_head preempt_list;
+       spinlock_t lock;
+       struct swait_queue_head wq;
+       spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
+       u64 stolen_tb;
+       u64 preempt_tb;
+       struct kvm_vcpu *runner;
+       struct kvm *kvm;
+       u64 tb_offset;          /* guest timebase - host timebase */
+       ulong lpcr;
+       u32 arch_compat;
+       ulong pcr;
+       ulong dpdes;            /* doorbell state (POWER8) */
+       ulong vtb;              /* virtual timebase */
+       ulong conferring_threads;
+       unsigned int halt_poll_ns;
+};
+
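
The lock-free protocol the entry_exit_map comment describes is a compare-and-swap loop: read the map, refuse to enter if any exit bit is set, otherwise try to publish the entry bit. A standalone sketch, with C11 atomics standing in for the kernel's cmpxchg() (the actual enter/exit paths are outside this hunk):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Low 8 bits: threads that entered; next 8 bits: threads that exited */
    static _Atomic int entry_exit_map;

    /* Set our entry bit iff no thread has exited yet, without a lock */
    static int try_enter(int tid)
    {
            int old = atomic_load(&entry_exit_map);

            do {
                    if (old >> 8)   /* exit map non-zero: vcore is exiting */
                            return 0;
            } while (!atomic_compare_exchange_weak(&entry_exit_map, &old,
                                                   old | (1 << tid)));
            return 1;
    }

    int main(void)
    {
            printf("thread 0 enters: %d\n", try_enter(0));      /* 1 */
            atomic_fetch_or(&entry_exit_map, 1 << (8 + 0));     /* 0 exits */
            printf("thread 1 enters: %d\n", try_enter(1));      /* 0 */
            return 0;
    }
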
 struct kvmppc_vcpu_book3s {
        struct kvmppc_sid_map sid_map[SID_MAP_NUM];
        struct {
@@ -83,6 +120,7 @@ struct kvmppc_vcpu_book3s {
        u64 sdr1;
        u64 hior;
        u64 msr_mask;
+       u64 vtb;
 #ifdef CONFIG_PPC_BOOK3S_32
        u32 vsid_pool[VSID_POOL_SIZE];
        u32 vsid_next;
@@ -191,6 +229,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
                                 struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
                                   struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
index 88d17b4..8482921 100644 (file)
@@ -20,6 +20,8 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#include <asm/book3s/64/mmu-hash.h>
+
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
@@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
        hpte[0] = cpu_to_be64(hpte_v);
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                                             unsigned long pte_index)
 {
-       int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
+       int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
        unsigned int penc;
        unsigned long rb = 0, va_low, sllp;
        unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
        if (v & HPTE_V_LARGE) {
-               for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
-
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[b_psize].shift)
-                               continue;
-
-                       a_psize = __hpte_actual_psize(lp, b_psize);
-                       if (a_psize != -1)
-                               break;
-               }
+               i = hpte_page_sizes[lp];
+               b_psize = i & 0xf;
+               a_psize = i >> 4;
        }
+
        /*
         * Ignore the top 14 bits of va
         * v have top two bits covering segment size, hence move
@@ -159,7 +125,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
        /* This covers 14..54 bits of va*/
        rb = (v & ~0x7fUL) << 16;               /* AVA field */
 
-       rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;   /*  B field */
        /*
         * AVA in v had cleared lower 23 bits. We need to derive
         * that from pteg index
@@ -211,49 +176,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                break;
        }
        }
-       rb |= (v >> 54) & 0x300;                /* B field */
+       rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;   /* B field */
        return rb;
 }
 
-static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
-                                            bool is_base_size)
-{
-
-       int size, a_psize;
-       /* Look at the 8 bit LP value */
-       unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-       /* only handle 4k, 64k and 16M pages for now */
-       if (!(h & HPTE_V_LARGE))
-               return 1ul << 12;
-       else {
-               for (size = 0; size < MMU_PAGE_COUNT; size++) {
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[size].shift)
-                               continue;
-
-                       a_psize = __hpte_actual_psize(lp, size);
-                       if (a_psize != -1) {
-                               if (is_base_size)
-                                       return 1ul << mmu_psize_defs[size].shift;
-                               return 1ul << mmu_psize_defs[a_psize].shift;
-                       }
-               }
-
-       }
-       return 0;
-}
-
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
-{
-       return __hpte_page_size(h, l, 0);
-}
-
-static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
-{
-       return __hpte_page_size(h, l, 1);
-}
-
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
 {
        return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
index ec35af3..28350a2 100644 (file)
@@ -43,6 +43,8 @@
 #include <asm/cputhreads.h>
 #define KVM_MAX_VCPU_ID                (threads_per_subcore * KVM_MAX_VCORES)
 
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
@@ -95,42 +97,49 @@ struct kvmppc_vcpu_book3s;
 struct kvmppc_book3s_shadow_vcpu;
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 sum_exits;
-       u32 mmio_exits;
-       u32 signal_exits;
-       u32 light_exits;
+       u64 sum_exits;
+       u64 mmio_exits;
+       u64 signal_exits;
+       u64 light_exits;
        /* Account for special types of light exits: */
-       u32 itlb_real_miss_exits;
-       u32 itlb_virt_miss_exits;
-       u32 dtlb_real_miss_exits;
-       u32 dtlb_virt_miss_exits;
-       u32 syscall_exits;
-       u32 isi_exits;
-       u32 dsi_exits;
-       u32 emulated_inst_exits;
-       u32 dec_exits;
-       u32 ext_intr_exits;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 dbell_exits;
-       u32 gdbell_exits;
-       u32 ld;
-       u32 st;
+       u64 itlb_real_miss_exits;
+       u64 itlb_virt_miss_exits;
+       u64 dtlb_real_miss_exits;
+       u64 dtlb_virt_miss_exits;
+       u64 syscall_exits;
+       u64 isi_exits;
+       u64 dsi_exits;
+       u64 emulated_inst_exits;
+       u64 dec_exits;
+       u64 ext_intr_exits;
+       u64 halt_poll_success_ns;
+       u64 halt_poll_fail_ns;
+       u64 halt_wait_ns;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_successful_wait;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 dbell_exits;
+       u64 gdbell_exits;
+       u64 ld;
+       u64 st;
 #ifdef CONFIG_PPC_BOOK3S
-       u32 pf_storage;
-       u32 pf_instruc;
-       u32 sp_storage;
-       u32 sp_instruc;
-       u32 queue_intr;
-       u32 ld_slow;
-       u32 st_slow;
+       u64 pf_storage;
+       u64 pf_instruc;
+       u64 sp_storage;
+       u64 sp_instruc;
+       u64 queue_intr;
+       u64 ld_slow;
+       u64 st_slow;
 #endif
+       u64 pthru_all;
+       u64 pthru_host;
+       u64 pthru_bad_aff;
 };
 
 enum kvm_exit_types {
@@ -197,6 +206,8 @@ struct kvmppc_spapr_tce_table {
 struct kvmppc_xics;
 struct kvmppc_icp;
 
+struct kvmppc_passthru_irqmap;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -267,6 +278,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
        struct kvmppc_xics *xics;
+       struct kvmppc_passthru_irqmap *pimap;
 #endif
        struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -275,41 +287,6 @@ struct kvm_arch {
 #endif
 };
 
-/*
- * Struct for a virtual core.
- * Note: entry_exit_map combines a bitmap of threads that have entered
- * in the bottom 8 bits and a bitmap of threads that have exited in the
- * next 8 bits.  This is so that we can atomically set the entry bit
- * iff the exit map is 0 without taking a lock.
- */
-struct kvmppc_vcore {
-       int n_runnable;
-       int num_threads;
-       int entry_exit_map;
-       int napping_threads;
-       int first_vcpuid;
-       u16 pcpu;
-       u16 last_cpu;
-       u8 vcore_state;
-       u8 in_guest;
-       struct kvmppc_vcore *master_vcore;
-       struct list_head runnable_threads;
-       struct list_head preempt_list;
-       spinlock_t lock;
-       struct swait_queue_head wq;
-       spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
-       u64 stolen_tb;
-       u64 preempt_tb;
-       struct kvm_vcpu *runner;
-       struct kvm *kvm;
-       u64 tb_offset;          /* guest timebase - host timebase */
-       ulong lpcr;
-       u32 arch_compat;
-       ulong pcr;
-       ulong dpdes;            /* doorbell state (POWER8) */
-       ulong conferring_threads;
-};
-
 #define VCORE_ENTRY_MAP(vc)    ((vc)->entry_exit_map & 0xff)
 #define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
@@ -329,6 +306,7 @@ struct kvmppc_vcore {
 #define VCORE_SLEEPING 3
 #define VCORE_RUNNING  4
 #define VCORE_EXITING  5
+#define VCORE_POLLING  6
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -397,6 +375,20 @@ struct kvmhv_tb_accumulator {
        u64     tb_max;         /* max time */
 };
 
+#ifdef CONFIG_PPC_BOOK3S_64
+struct kvmppc_irq_map {
+       u32     r_hwirq;
+       u32     v_hwirq;
+       struct irq_desc *desc;
+};
+
+#define        KVMPPC_PIRQ_MAPPED      1024
+struct kvmppc_passthru_irqmap {
+       int n_mapped;
+       struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED];
+};
+#endif
+
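
For scale: with KVMPPC_PIRQ_MAPPED at 1024 and sizeof(struct kvmppc_irq_map) at 16 bytes on 64-bit (two u32s plus a pointer), the mapped[] array makes each pimap roughly 16 KiB, which is presumably why it is allocated lazily on the first passthrough IRQ (see kvmppc_alloc_pimap() further down) rather than embedded in struct kvm_arch.
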
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM   2
 #define KVMPPC_BOOKE_DAC_NUM   2
@@ -483,7 +475,6 @@ struct kvm_vcpu_arch {
        ulong purr;
        ulong spurr;
        ulong ic;
-       ulong vtb;
        ulong dscr;
        ulong amr;
        ulong uamor;
@@ -668,7 +659,6 @@ struct kvm_vcpu_arch {
        long pgfault_index;
        unsigned long pgfault_hpte[2];
 
-       struct list_head run_list;
        struct task_struct *run_task;
        struct kvm_run *kvm_run;
 
index 2544eda..f6e4964 100644 (file)
@@ -287,6 +287,10 @@ struct kvmppc_ops {
        long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
                              unsigned long arg);
        int (*hcall_implemented)(unsigned long hcall);
+       int (*irq_bypass_add_producer)(struct irq_bypass_consumer *,
+                                      struct irq_bypass_producer *);
+       void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
+                                       struct irq_bypass_producer *);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -453,8 +457,19 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+                               struct kvm *kvm)
+{
+       if (kvm && kvm_irq_bypass)
+               return kvm->arch.pimap;
+       return NULL;
+}
+
 extern void kvmppc_alloc_host_rm_ops(void);
 extern void kvmppc_free_host_rm_ops(void);
+extern void kvmppc_free_pimap(struct kvm *kvm);
+extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -464,10 +479,23 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
                        struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+                                  unsigned long host_irq);
+extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+                                  unsigned long host_irq);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+                                struct kvmppc_irq_map *irq_map,
+                                struct kvmppc_passthru_irqmap *pimap);
 extern int h_ipi_redirect;
 #else
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+                               struct kvm *kvm)
+       { return NULL; }
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
+static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+       { return 0; }
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
        { return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
index e2fb408..b78e8d3 100644 (file)
@@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void)
 #define MMU_PAGE_16G   13
 #define MMU_PAGE_64G   14
 
+/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
 #define MMU_PAGE_COUNT 15
 
 #ifdef CONFIG_PPC_BOOK3S_64
index ee05bd2..e958b70 100644 (file)
@@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func,
 int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
                                   uint64_t offset, uint32_t data);
 int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
+int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
 int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
 int64_t opal_register_exception_handler(uint64_t opal_exception,
                                        uint64_t handler_address,
index 0cbd813..1b46b52 100644 (file)
@@ -12,6 +12,7 @@
 
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
+#include <linux/irq.h>
 #include <misc/cxl-base.h>
 #include <asm/opal-api.h>
 
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
index f69f40f..978dada 100644 (file)
 #define   MMCR0_FCHV   0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1     798
 #define SPRN_MMCR2     785
+#define SPRN_UMMCR2    769
 #define SPRN_MMCRA     0x312
 #define   MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SDAR_DCACHE_MISS 0x40000000UL
index b89d14c..a51ae9b 100644 (file)
@@ -506,7 +506,6 @@ int main(void)
        DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
        DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
        DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
-       DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
        DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
        DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
        DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
@@ -557,6 +556,7 @@ int main(void)
        DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
        DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
        DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
+       DEFINE(VCORE_VTB, offsetof(struct kvmppc_vcore, vtb));
        DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
        DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
        DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
index c2024ac..029be26 100644 (file)
@@ -22,6 +22,9 @@ config KVM
        select ANON_INODES
        select HAVE_KVM_EVENTFD
        select SRCU
+       select KVM_VFIO
+       select IRQ_BYPASS_MANAGER
+       select HAVE_KVM_IRQ_BYPASS
 
 config KVM_BOOK3S_HANDLER
        bool
index 855d4b9..7dd89b7 100644 (file)
@@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-               $(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
 common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 
 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.
 CFLAGS_emulate.o  := -I.
 CFLAGS_emulate_loadstore.o  := -I.
 
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+common-objs-y += powerpc.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
 
@@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
 
 kvm-e500-objs := \
        $(common-objs-y) \
+       emulate.o \
        booke.o \
        booke_emulate.o \
        booke_interrupts.o \
@@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
 
 kvm-e500mc-objs := \
        $(common-objs-y) \
+       emulate.o \
        booke.o \
        booke_emulate.o \
        bookehv_interrupts.o \
@@ -61,9 +63,6 @@ kvm-pr-y := \
        book3s_32_mmu.o
 
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-kvm-book3s_64-module-objs := \
-       $(KVM)/coalesced_mmio.o
-
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
        book3s_rmhandlers.o
 endif
@@ -89,11 +88,8 @@ endif
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
        book3s_xics.o
 
-kvm-book3s_64-module-objs += \
-       $(KVM)/kvm_main.o \
-       $(KVM)/eventfd.o \
-       powerpc.o \
-       emulate_loadstore.o \
+kvm-book3s_64-module-objs := \
+       $(common-objs-y) \
        book3s.o \
        book3s_64_vio.o \
        book3s_rtas.o \
@@ -103,6 +99,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
        $(common-objs-y) \
+       emulate.o \
        fpu.o \
        book3s_paired_singles.o \
        book3s.o \
index 47018fc..b6952dd 100644 (file)
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "dec",         VCPU_STAT(dec_exits) },
        { "ext_intr",    VCPU_STAT(ext_intr_exits) },
        { "queue_intr",  VCPU_STAT(queue_intr) },
+       { "halt_poll_success_ns",       VCPU_STAT(halt_poll_success_ns) },
+       { "halt_poll_fail_ns",          VCPU_STAT(halt_poll_fail_ns) },
+       { "halt_wait_ns",               VCPU_STAT(halt_wait_ns) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+       { "halt_successful_wait",       VCPU_STAT(halt_successful_wait) },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "pf_storage",  VCPU_STAT(pf_storage) },
@@ -64,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "ld_slow",     VCPU_STAT(ld_slow) },
        { "st",          VCPU_STAT(st) },
        { "st_slow",     VCPU_STAT(st_slow) },
+       { "pthru_all",       VCPU_STAT(pthru_all) },
+       { "pthru_host",      VCPU_STAT(pthru_host) },
+       { "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
        { NULL }
 };
 
@@ -592,9 +599,6 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
                case KVM_REG_PPC_BESCR:
                        *val = get_reg_val(id, vcpu->arch.bescr);
                        break;
-               case KVM_REG_PPC_VTB:
-                       *val = get_reg_val(id, vcpu->arch.vtb);
-                       break;
                case KVM_REG_PPC_IC:
                        *val = get_reg_val(id, vcpu->arch.ic);
                        break;
@@ -666,9 +670,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
                case KVM_REG_PPC_BESCR:
                        vcpu->arch.bescr = set_reg_val(id, *val);
                        break;
-               case KVM_REG_PPC_VTB:
-                       vcpu->arch.vtb = set_reg_val(id, *val);
-                       break;
                case KVM_REG_PPC_IC:
                        vcpu->arch.ic = set_reg_val(id, *val);
                        break;
index 2afdb9c..8359752 100644 (file)
@@ -498,6 +498,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
        case SPRN_MMCR0:
        case SPRN_MMCR1:
        case SPRN_MMCR2:
+       case SPRN_UMMCR2:
 #endif
                break;
 unprivileged:
@@ -579,7 +580,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
                *spr_val = vcpu->arch.spurr;
                break;
        case SPRN_VTB:
-               *spr_val = vcpu->arch.vtb;
+               *spr_val = to_book3s(vcpu)->vtb;
                break;
        case SPRN_IC:
                *spr_val = vcpu->arch.ic;
@@ -640,6 +641,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
        case SPRN_MMCR0:
        case SPRN_MMCR1:
        case SPRN_MMCR2:
+       case SPRN_UMMCR2:
        case SPRN_TIR:
 #endif
                *spr_val = 0;
index 2fd5580..3686471 100644 (file)
 #include <asm/smp.h>
 #include <asm/dbell.h>
 #include <asm/hmi.h>
+#include <asm/pnv-pci.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
 #include <linux/module.h>
+#include <linux/compiler.h>
 
 #include "book3s.h"
 
@@ -70,6 +74,8 @@
 
 /* Used to indicate that a guest page fault needs to be handled */
 #define RESUME_PAGE_FAULT      (RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH     (RESUME_GUEST | RESUME_FLAG_ARCH2)
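
Numerically, with the flag values defined earlier (RESUME_GUEST = 0, RESUME_FLAG_ARCH1 = 1<<2, RESUME_FLAG_ARCH2 = 1<<3), RESUME_PAGE_FAULT works out to 4 and RESUME_PASSTHROUGH to 8: distinct values for the exit loop to dispatch on, while both still carry the "resume guest" base action.
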
 
 /* Used as a "null" value for timebase values */
 #define TB_NIL (~(u64)0)
@@ -89,14 +95,55 @@ static struct kernel_param_ops module_param_ops = {
        .get = param_get_int,
 };
 
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+                                                       S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
                                                        S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/*
+ * Factor by which the vcore halt poll interval is grown; default is to
+ * double it.
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, uint, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/*
+ * Factor by which the vcore halt poll interval is shrunk; default is to
+ * reset it.
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, uint, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+               int *ip)
+{
+       int i = *ip;
+       struct kvm_vcpu *vcpu;
+
+       while (++i < MAX_SMT_THREADS) {
+               vcpu = READ_ONCE(vc->runnable_threads[i]);
+               if (vcpu) {
+                       *ip = i;
+                       return vcpu;
+               }
+       }
+       return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+       for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
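
Since runnable_threads is now a sparse array indexed by thread ID rather than a list, the iterator has to skip NULL slots; next_runnable_thread() threads the index through *ip to make the macro above work with plain locals. A userspace mock of the same shape (toy structs; the kernel's READ_ONCE, paired with the WRITE_ONCE in kvmppc_remove_runnable() below, is elided):

    #include <stdio.h>
    #include <stddef.h>

    #define MAX_SMT_THREADS 8

    struct vcpu { int id; };
    struct vcore { struct vcpu *runnable_threads[MAX_SMT_THREADS]; };

    static struct vcpu *next_runnable_thread(struct vcore *vc, int *ip)
    {
            int i = *ip;

            while (++i < MAX_SMT_THREADS) {
                    struct vcpu *v = vc->runnable_threads[i];
                    if (v) {
                            *ip = i;
                            return v;
                    }
            }
            return NULL;
    }

    #define for_each_runnable_thread(i, vcpu, vc) \
            for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )

    int main(void)
    {
            struct vcpu a = { 0 }, b = { 5 };
            struct vcore vc = { .runnable_threads = { [0] = &a, [5] = &b } };
            struct vcpu *v;
            int i;

            for_each_runnable_thread(i, v, &vc)
                    printf("slot %d -> vcpu %d\n", i, v->id); /* skips NULLs */
            return 0;
    }
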
 static bool kvmppc_ipi_thread(int cpu)
 {
        /* On POWER8 for IPIs to threads in the same core, use msgsnd */
@@ -991,6 +1038,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                r = RESUME_GUEST;
                break;
+       case BOOK3S_INTERRUPT_HV_RM_HARD:
+               r = RESUME_PASSTHROUGH;
+               break;
        default:
                kvmppc_dump_regs(vcpu);
                printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
@@ -1149,6 +1199,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_DPDES:
                *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
                break;
+       case KVM_REG_PPC_VTB:
+               *val = get_reg_val(id, vcpu->arch.vcore->vtb);
+               break;
        case KVM_REG_PPC_DAWR:
                *val = get_reg_val(id, vcpu->arch.dawr);
                break;
@@ -1341,6 +1394,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_DPDES:
                vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
                break;
+       case KVM_REG_PPC_VTB:
+               vcpu->arch.vcore->vtb = set_reg_val(id, *val);
+               break;
        case KVM_REG_PPC_DAWR:
                vcpu->arch.dawr = set_reg_val(id, *val);
                break;
@@ -1493,7 +1549,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        if (vcore == NULL)
                return NULL;
 
-       INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
        init_swait_queue_head(&vcore->wq);
@@ -1802,7 +1857,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
        spin_unlock_irq(&vcpu->arch.tbacct_lock);
        --vc->n_runnable;
-       list_del(&vcpu->arch.run_list);
+       WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
 }
 
 static int kvmppc_grab_hwthread(int cpu)
@@ -2048,66 +2103,6 @@ static void init_master_vcore(struct kvmppc_vcore *vc)
        vc->conferring_threads = 0;
 }
 
-/*
- * See if the existing subcores can be split into 3 (or fewer) subcores
- * of at most two threads each, so we can fit in another vcore.  This
- * assumes there are at most two subcores and at most 6 threads in total.
- */
-static bool can_split_piggybacked_subcores(struct core_info *cip)
-{
-       int sub, new_sub;
-       int large_sub = -1;
-       int thr;
-       int n_subcores = cip->n_subcores;
-       struct kvmppc_vcore *vc, *vcnext;
-       struct kvmppc_vcore *master_vc = NULL;
-
-       for (sub = 0; sub < cip->n_subcores; ++sub) {
-               if (cip->subcore_threads[sub] <= 2)
-                       continue;
-               if (large_sub >= 0)
-                       return false;
-               large_sub = sub;
-               vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
-                                     preempt_list);
-               if (vc->num_threads > 2)
-                       return false;
-               n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
-       }
-       if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2))
-               return false;
-
-       /*
-        * Seems feasible, so go through and move vcores to new subcores.
-        * Note that when we have two or more vcores in one subcore,
-        * all those vcores must have only one thread each.
-        */
-       new_sub = cip->n_subcores;
-       thr = 0;
-       sub = large_sub;
-       list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
-               if (thr >= 2) {
-                       list_del(&vc->preempt_list);
-                       list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
-                       /* vc->num_threads must be 1 */
-                       if (++cip->subcore_threads[new_sub] == 1) {
-                               cip->subcore_vm[new_sub] = vc->kvm;
-                               init_master_vcore(vc);
-                               master_vc = vc;
-                               ++cip->n_subcores;
-                       } else {
-                               vc->master_vcore = master_vc;
-                               ++new_sub;
-                       }
-               }
-               thr += vc->num_threads;
-       }
-       cip->subcore_threads[large_sub] = 2;
-       cip->max_subcore_threads = 2;
-
-       return true;
-}
-
 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 {
        int n_threads = vc->num_threads;
@@ -2118,23 +2113,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 
        if (n_threads < cip->max_subcore_threads)
                n_threads = cip->max_subcore_threads;
-       if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
-               cip->max_subcore_threads = n_threads;
-       } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
-                  vc->num_threads <= 2) {
-               /*
-                * We may be able to fit another subcore in by
-                * splitting an existing subcore with 3 or 4
-                * threads into two 2-thread subcores, or one
-                * with 5 or 6 threads into three subcores.
-                * We can only do this if those subcores have
-                * piggybacked virtual cores.
-                */
-               if (!can_split_piggybacked_subcores(cip))
-                       return false;
-       } else {
+       if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
                return false;
-       }
+       cip->max_subcore_threads = n_threads;
 
        sub = cip->n_subcores;
        ++cip->n_subcores;
@@ -2148,43 +2129,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        return true;
 }
 
-static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
-                                 struct core_info *cip, int sub)
-{
-       struct kvmppc_vcore *vc;
-       int n_thr;
-
-       vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
-                             preempt_list);
-
-       /* require same VM and same per-core reg values */
-       if (pvc->kvm != vc->kvm ||
-           pvc->tb_offset != vc->tb_offset ||
-           pvc->pcr != vc->pcr ||
-           pvc->lpcr != vc->lpcr)
-               return false;
-
-       /* P8 guest with > 1 thread per core would see wrong TIR value */
-       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
-           (vc->num_threads > 1 || pvc->num_threads > 1))
-               return false;
-
-       n_thr = cip->subcore_threads[sub] + pvc->num_threads;
-       if (n_thr > cip->max_subcore_threads) {
-               if (!subcore_config_ok(cip->n_subcores, n_thr))
-                       return false;
-               cip->max_subcore_threads = n_thr;
-       }
-
-       cip->total_threads += pvc->num_threads;
-       cip->subcore_threads[sub] = n_thr;
-       pvc->master_vcore = vc;
-       list_del(&pvc->preempt_list);
-       list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
-
-       return true;
-}
-
 /*
  * Work out whether it is possible to piggyback the execution of
  * vcore *pvc onto the execution of the other vcores described in *cip.
@@ -2192,27 +2136,18 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
                          int target_threads)
 {
-       int sub;
-
        if (cip->total_threads + pvc->num_threads > target_threads)
                return false;
-       for (sub = 0; sub < cip->n_subcores; ++sub)
-               if (cip->subcore_threads[sub] &&
-                   can_piggyback_subcore(pvc, cip, sub))
-                       return true;
-
-       if (can_dynamic_split(pvc, cip))
-               return true;
 
-       return false;
+       return can_dynamic_split(pvc, cip);
 }
 
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       int i;
+       struct kvm_vcpu *vcpu;
 
-       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                arch.run_list) {
+       for_each_runnable_thread(i, vcpu, vc) {
                if (signal_pending(vcpu->arch.run_task))
                        vcpu->arch.ret = -EINTR;
                else if (vcpu->arch.vpa.update_pending ||
@@ -2259,15 +2194,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
 
 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
-       int still_running = 0;
+       int still_running = 0, i;
        u64 now;
        long ret;
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu;
 
        spin_lock(&vc->lock);
        now = get_tb();
-       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                arch.run_list) {
+       for_each_runnable_thread(i, vcpu, vc) {
                /* cancel pending dec exception if dec is positive */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
@@ -2307,8 +2241,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
                }
                if (vc->n_runnable > 0 && vc->runner == NULL) {
                        /* make sure there's a candidate runner awake */
-                       vcpu = list_first_entry(&vc->runnable_threads,
-                                               struct kvm_vcpu, arch.run_list);
+                       i = -1;
+                       vcpu = next_runnable_thread(vc, &i);
                        wake_up(&vcpu->arch.cpu_run);
                }
        }
@@ -2361,7 +2295,7 @@ static inline void kvmppc_set_host_core(int cpu)
  */
 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu;
        int i;
        int srcu_idx;
        struct core_info core_info;
@@ -2397,8 +2331,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         */
        if ((threads_per_core > 1) &&
            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-               list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                        arch.run_list) {
+               for_each_runnable_thread(i, vcpu, vc) {
                        vcpu->arch.ret = -EBUSY;
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
@@ -2477,8 +2410,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                active |= 1 << thr;
                list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
                        pvc->pcpu = pcpu + thr;
-                       list_for_each_entry(vcpu, &pvc->runnable_threads,
-                                           arch.run_list) {
+                       for_each_runnable_thread(i, vcpu, pvc) {
                                kvmppc_start_thread(vcpu, pvc);
                                kvmppc_create_dtl_entry(vcpu, pvc);
                                trace_kvm_guest_enter(vcpu);
@@ -2604,34 +2536,92 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
        finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+       /* 10us base */
+       if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+               vc->halt_poll_ns = 10000;
+       else
+               vc->halt_poll_ns *= halt_poll_ns_grow;
+
+       if (vc->halt_poll_ns > halt_poll_max_ns)
+               vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+       if (halt_poll_ns_shrink == 0)
+               vc->halt_poll_ns = 0;
+       else
+               vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/*
+ * Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded.
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       for_each_runnable_thread(i, vcpu, vc) {
+               if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+                       return 1;
+       }
+
+       return 0;
+}
+
 /*
  * All the vcpus in this vcore are idle, so wait for a decrementer
  * or external interrupt to one of the vcpus.  vc->lock is held.
  */
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu;
+       ktime_t cur, start_poll, start_wait;
        int do_sleep = 1;
+       u64 block_ns;
        DECLARE_SWAITQUEUE(wait);
 
-       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       /* Poll for pending exceptions and ceded state */
+       cur = start_poll = ktime_get();
+       if (vc->halt_poll_ns) {
+               ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+               ++vc->runner->stat.halt_attempted_poll;
 
-       /*
-        * Check one last time for pending exceptions and ceded state after
-        * we put ourselves on the wait queue
-        */
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
-                       do_sleep = 0;
-                       break;
+               vc->vcore_state = VCORE_POLLING;
+               spin_unlock(&vc->lock);
+
+               do {
+                       if (kvmppc_vcore_check_block(vc)) {
+                               do_sleep = 0;
+                               break;
+                       }
+                       cur = ktime_get();
+               } while (single_task_running() && ktime_before(cur, stop));
+
+               spin_lock(&vc->lock);
+               vc->vcore_state = VCORE_INACTIVE;
+
+               if (!do_sleep) {
+                       ++vc->runner->stat.halt_successful_poll;
+                       goto out;
                }
        }
 
-       if (!do_sleep) {
+       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+       if (kvmppc_vcore_check_block(vc)) {
                finish_swait(&vc->wq, &wait);
-               return;
+               do_sleep = 0;
+               /* If we polled, count this as a successful poll */
+               if (vc->halt_poll_ns)
+                       ++vc->runner->stat.halt_successful_poll;
+               goto out;
        }
 
+       start_wait = ktime_get();
+
        vc->vcore_state = VCORE_SLEEPING;
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
@@ -2640,13 +2630,52 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
+       ++vc->runner->stat.halt_successful_wait;
+
+       cur = ktime_get();
+
+out:
+       block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+       /* Attribute wait time */
+       if (do_sleep) {
+               vc->runner->stat.halt_wait_ns +=
+                       ktime_to_ns(cur) - ktime_to_ns(start_wait);
+               /* Attribute failed poll time */
+               if (vc->halt_poll_ns)
+                       vc->runner->stat.halt_poll_fail_ns +=
+                               ktime_to_ns(start_wait) -
+                               ktime_to_ns(start_poll);
+       } else {
+               /* Attribute successful poll time */
+               if (vc->halt_poll_ns)
+                       vc->runner->stat.halt_poll_success_ns +=
+                               ktime_to_ns(cur) -
+                               ktime_to_ns(start_poll);
+       }
+
+       /* Adjust poll time */
+       if (halt_poll_max_ns) {
+               if (block_ns <= vc->halt_poll_ns)
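+                       /* Blocked within the poll window: no adjustment */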
+                       ;
+               /* We slept and blocked for longer than the max halt time */
+               else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+                       shrink_halt_poll_ns(vc);
+               /* We slept and our poll time is too small */
+               else if (vc->halt_poll_ns < halt_poll_max_ns &&
+                               block_ns < halt_poll_max_ns)
+                       grow_halt_poll_ns(vc);
+       } else
+               vc->halt_poll_ns = 0;
+
+       trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
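
Taken together with the module parameters above (grow factor 2, shrink factor 0, i.e. reset), the per-vcore window ramps 10000 -> 20000 -> ... ns while wakeups keep landing under the cap, and collapses to 0 after one sleep that overshoots it. A standalone simulation of that trajectory (the cap is illustrative, since KVM_HALT_POLL_NS_DEFAULT is configuration-dependent):

    #include <stdio.h>

    /* Illustrative values; mirrors grow/shrink_halt_poll_ns() above */
    static unsigned int max_ns = 500000, grow = 2, shrink = 0;

    static unsigned int grow_ns(unsigned int ns)
    {
            ns = (ns == 0 && grow) ? 10000 : ns * grow;     /* 10us base */
            return ns > max_ns ? max_ns : ns;
    }

    int main(void)
    {
            unsigned int ns = 0;

            /* Wakeups arriving under the cap keep doubling the window */
            for (int i = 0; i < 8; i++)
                    printf("window: %u ns\n", ns = grow_ns(ns));

            /* One sleep past the cap: shrink factor 0 means reset */
            ns = shrink ? ns / shrink : 0;
            printf("after long block: %u ns\n", ns);
            return 0;
    }
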
 
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-       int n_ceded;
+       int n_ceded, i;
        struct kvmppc_vcore *vc;
-       struct kvm_vcpu *v, *vn;
+       struct kvm_vcpu *v;
 
        trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -2666,7 +2695,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
        vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
        vcpu->arch.busy_preempt = TB_NIL;
-       list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+       WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
        ++vc->n_runnable;
 
        /*
@@ -2706,8 +2735,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
                        continue;
                }
-               list_for_each_entry_safe(v, vn, &vc->runnable_threads,
-                                        arch.run_list) {
+               for_each_runnable_thread(i, v, vc) {
                        kvmppc_core_prepare_to_enter(v);
                        if (signal_pending(v->arch.run_task)) {
                                kvmppc_remove_runnable(vc, v);
@@ -2720,7 +2748,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
                        break;
                n_ceded = 0;
-               list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+               for_each_runnable_thread(i, v, vc) {
                        if (!v->arch.pending_exceptions)
                                n_ceded += v->arch.ceded;
                        else
@@ -2759,8 +2787,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
                /* Wake up some vcpu to run the core */
-               v = list_first_entry(&vc->runnable_threads,
-                                    struct kvm_vcpu, arch.run_list);
+               i = -1;
+               v = next_runnable_thread(vc, &i);
                wake_up(&v->arch.cpu_run);
        }
 
@@ -2818,7 +2846,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        r = kvmppc_book3s_hv_page_fault(run, vcpu,
                                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
                        srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-               }
+               } else if (r == RESUME_PASSTHROUGH)
+                       r = kvmppc_xics_rm_complete(vcpu, 0);
        } while (is_kvmppc_resume_guest(r));
 
  out:
@@ -3247,6 +3276,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
        kvmppc_free_vcores(kvm);
 
        kvmppc_free_hpt(kvm);
+
+       kvmppc_free_pimap(kvm);
 }
 
 /* We don't need to emulate any privileged instructions or dcbz */
@@ -3282,6 +3313,184 @@ static int kvmppc_core_check_processor_compat_hv(void)
        return 0;
 }
 
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+       kfree(kvm->arch.pimap);
+}
+
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+       return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+       struct irq_desc *desc;
+       struct kvmppc_irq_map *irq_map;
+       struct kvmppc_passthru_irqmap *pimap;
+       struct irq_chip *chip;
+       int i;
+
+       if (!kvm_irq_bypass)
+               return 1;
+
+       desc = irq_to_desc(host_irq);
+       if (!desc)
+               return -EIO;
+
+       mutex_lock(&kvm->lock);
+
+       pimap = kvm->arch.pimap;
+       if (pimap == NULL) {
+               /* First call, allocate structure to hold IRQ map */
+               pimap = kvmppc_alloc_pimap();
+               if (pimap == NULL) {
+                       mutex_unlock(&kvm->lock);
+                       return -ENOMEM;
+               }
+               kvm->arch.pimap = pimap;
+       }
+
+       /*
+        * For now, we only support interrupts for which the EOI operation
+        * is an OPAL call followed by a write to XIRR, since that's
+        * what our real-mode EOI code does.
+        */
+       chip = irq_data_get_irq_chip(&desc->irq_data);
+       if (!chip || !is_pnv_opal_msi(chip)) {
+               pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+                       host_irq, guest_gsi);
+               mutex_unlock(&kvm->lock);
+               return -ENOENT;
+       }
+
+       /*
+        * See if we already have an entry for this guest IRQ number.
+        * If it's mapped to a hardware IRQ number, that's an error;
+        * otherwise re-use this entry.
+        */
+       for (i = 0; i < pimap->n_mapped; i++) {
+               if (guest_gsi == pimap->mapped[i].v_hwirq) {
+                       if (pimap->mapped[i].r_hwirq) {
+                               mutex_unlock(&kvm->lock);
+                               return -EINVAL;
+                       }
+                       break;
+               }
+       }
+
+       if (i == KVMPPC_PIRQ_MAPPED) {
+               mutex_unlock(&kvm->lock);
+               return -EAGAIN;         /* table is full */
+       }
+
+       irq_map = &pimap->mapped[i];
+
+       irq_map->v_hwirq = guest_gsi;
+       irq_map->desc = desc;
+
+       /*
+        * Order the above two stores before the next to serialize with
+        * the KVM real mode handler.
+        */
+       smp_wmb();
+       irq_map->r_hwirq = desc->irq_data.hwirq;
+
+       if (i == pimap->n_mapped)
+               pimap->n_mapped++;
+
+       kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
+       mutex_unlock(&kvm->lock);
+
+       return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+       struct irq_desc *desc;
+       struct kvmppc_passthru_irqmap *pimap;
+       int i;
+
+       if (!kvm_irq_bypass)
+               return 0;
+
+       desc = irq_to_desc(host_irq);
+       if (!desc)
+               return -EIO;
+
+       mutex_lock(&kvm->lock);
+
+       if (kvm->arch.pimap == NULL) {
+               mutex_unlock(&kvm->lock);
+               return 0;
+       }
+       pimap = kvm->arch.pimap;
+
+       for (i = 0; i < pimap->n_mapped; i++) {
+               if (guest_gsi == pimap->mapped[i].v_hwirq)
+                       break;
+       }
+
+       if (i == pimap->n_mapped) {
+               mutex_unlock(&kvm->lock);
+               return -ENODEV;
+       }
+
+       kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
+       /* invalidate the entry */
+       pimap->mapped[i].r_hwirq = 0;
+
+       /*
+        * We don't free this structure even when the count goes to
+        * zero. The structure is freed when we destroy the VM.
+        */
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+                                            struct irq_bypass_producer *prod)
+{
+       int ret = 0;
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       irqfd->producer = prod;
+
+       ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+       if (ret)
+               pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+                       prod->irq, irqfd->gsi, ret);
+
+       return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+                                             struct irq_bypass_producer *prod)
+{
+       int ret;
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       irqfd->producer = NULL;
+
+       /*
+        * When the producer of a consumer is unregistered, we change back
+        * to the default external interrupt handling mode: KVM real mode
+        * will switch back to the host.
+        */
+       ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+       if (ret)
+               pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+                       prod->irq, irqfd->gsi, ret);
+}
+#endif
+
 static long kvm_arch_vm_ioctl_hv(struct file *filp,
                                 unsigned int ioctl, unsigned long arg)
 {
@@ -3400,6 +3609,10 @@ static struct kvmppc_ops kvm_ops_hv = {
        .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
        .arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
        .hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+       .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+       .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
 };
 
 static int kvm_init_subcore_bitmap(void)
index 5f0380d..0c84d6b 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/xics.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
+#include <asm/io.h>
 
 #define KVM_CMA_CHUNK_ORDER    18
 
@@ -286,3 +287,158 @@ void kvmhv_commence_exit(int trap)
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+                                        u32 xisr)
+{
+       int i;
+
+       /*
+        * We access the mapped array here without a lock.  That
+        * is safe because we never reduce the number of entries
+        * in the array and we never change the v_hwirq field of
+        * an entry once it is set.
+        *
+        * We have also carefully ordered the stores in the writer
+        * and the loads here in the reader, so that if we find a matching
+        * hwirq here, the associated GSI and irq_desc fields are valid.
+        */
+       for (i = 0; i < pimap->n_mapped; i++)  {
+               if (xisr == pimap->mapped[i].r_hwirq) {
+                       /*
+                        * Order subsequent reads in the caller to serialize
+                        * with the writer.
+                        */
+                       smp_rmb();
+                       return &pimap->mapped[i];
+               }
+       }
+       return NULL;
+}
+
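
The wmb/rmb pairing that makes this lock-free read safe is split across two files, so it is worth seeing side by side; condensed from kvmppc_set_passthru_irq() above and the loop here:

    /* writer (kvmppc_set_passthru_irq), simplified */
    irq_map->v_hwirq = guest_gsi;
    irq_map->desc    = desc;
    smp_wmb();                      /* publish payload before the key */
    irq_map->r_hwirq = desc->irq_data.hwirq;

    /* reader (get_irqmap), simplified */
    if (xisr == pimap->mapped[i].r_hwirq) {
            smp_rmb();              /* key matched: payload is now visible */
            /* safe to use mapped[i].v_hwirq and mapped[i].desc */
    }
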
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity, i.e. the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA; it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+       struct kvmppc_passthru_irqmap *pimap;
+       struct kvmppc_irq_map *irq_map;
+       struct kvm_vcpu *vcpu;
+
+       vcpu = local_paca->kvm_hstate.kvm_vcpu;
+       if (!vcpu)
+               return 1;
+       pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+       if (!pimap)
+               return 1;
+       irq_map = get_irqmap(pimap, xisr);
+       if (!irq_map)
+               return 1;
+
+       /* We're handling this interrupt, generic code doesn't need to */
+       local_paca->kvm_hstate.saved_xirr = 0;
+
+       return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+       return 1;
+}
+#endif
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *     0 if no interrupt is pending
+ *     1 if an interrupt is pending that needs to be handled by the host
+ *     2 if a passthrough interrupt needs completion in the host
+ *     -1 if there was a guest wakeup IPI (which has now been cleared)
+ *     -2 if there was a PCI passthrough external interrupt that was handled
+ */
+long kvmppc_read_intr(void)
+{
+       unsigned long xics_phys;
+       u32 h_xirr;
+       __be32 xirr;
+       u32 xisr;
+       u8 host_ipi;
+
+       /* see if a host IPI is pending */
+       host_ipi = local_paca->kvm_hstate.host_ipi;
+       if (host_ipi)
+               return 1;
+
+       /* Now read the interrupt from the ICP */
+       xics_phys = local_paca->kvm_hstate.xics_phys;
+       if (unlikely(!xics_phys))
+               return 1;
+
+       /*
+        * Save XIRR for later. Since we get control in reverse endian
+        * on LE systems, save it byte reversed and fetch it back in
+        * host endian. Note that xirr is the value read from the
+        * XIRR register, while h_xirr is the host endian version.
+        */
+       xirr = _lwzcix(xics_phys + XICS_XIRR);
+       h_xirr = be32_to_cpu(xirr);
+       local_paca->kvm_hstate.saved_xirr = h_xirr;
+       xisr = h_xirr & 0xffffff;
+       /*
+        * Ensure that the store/load complete to guarantee all side
+        * effects of loading from XIRR have completed.
+        */
+       smp_mb();
+
+       /* if nothing pending in the ICP */
+       if (!xisr)
+               return 0;
+
+       /*
+        * We found something in the ICP...
+        *
+        * If it is an IPI, clear the MFRR and EOI it.
+        */
+       if (xisr == XICS_IPI) {
+               _stbcix(xics_phys + XICS_MFRR, 0xff);
+               _stwcix(xics_phys + XICS_XIRR, xirr);
+               /*
+                * Need to ensure side effects of the above stores
+                * complete before proceeding.
+                */
+               smp_mb();
+
+               /*
+                * We need to re-check host IPI now in case it got set in the
+                * meantime. If it's clear, we bounce the interrupt to the
+                * guest.
+                */
+               host_ipi = local_paca->kvm_hstate.host_ipi;
+               if (unlikely(host_ipi != 0)) {
+                       /* We raced with the host,
+                        * we need to resend that IPI, bummer
+                        */
+                       _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+                       /* Let side effects complete */
+                       smp_mb();
+                       return 1;
+               }
+
+               /* OK, it's an IPI for us */
+               local_paca->kvm_hstate.saved_xirr = 0;
+               return -1;
+       }
+
+       return kvmppc_check_passthru(xisr, xirr);
+}
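
A caller dispatches on those five return values roughly as below; a hedged sketch, since the real consumer is the real-mode exit path outside this hunk (the host-completion case surfaces as RESUME_PASSTHROUGH, which kvmppc_vcpu_run_hv() earlier finishes via kvmppc_xics_rm_complete()):

    long r = kvmppc_read_intr();

    switch (r) {
    case 0:     /* nothing pending: re-enter the guest */
            break;
    case 1:     /* host interrupt: exit; the host ICP driver picks
                 * the saved XIRR back up from the PACA */
            break;
    case 2:     /* passthrough IRQ needing completion in the host */
            break;
    case -1:    /* guest wakeup IPI, already cleared: just wake */
            break;
    case -2:    /* passthrough IRQ fully delivered in real mode */
            break;
    }
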
index 980d8a6..82ff5de 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/kvm_host.h>
 #include <linux/err.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_ppc.h>
 #include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
+#include <asm/pgtable.h>
 #include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
 
 #include "book3s_xics.h"
 
 
 int h_ipi_redirect = 1;
 EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                            u32 new_irq);
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
 
 /* -- ICS routines -- */
 static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -708,10 +715,123 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
                icp->rm_action |= XICS_RM_NOTIFY_EOI;
                icp->rm_eoied_irq = irq;
        }
+
+       if (state->host_irq) {
+               ++vcpu->stat.pthru_all;
+               if (state->intr_cpu != -1) {
+                       int pcpu = raw_smp_processor_id();
+
+                       pcpu = cpu_first_thread_sibling(pcpu);
+                       ++vcpu->stat.pthru_host;
+                       if (state->intr_cpu != pcpu) {
+                               ++vcpu->stat.pthru_bad_aff;
+                               xics_opal_rm_set_server(state->host_irq, pcpu);
+                       }
+                       state->intr_cpu = -1;
+               }
+       }
  bail:
        return check_too_hard(xics, icp);
 }
 
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+       unsigned long xics_phys;
+       int64_t rc;
+
+       rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+       if (rc)
+               eoi_rc = rc;
+
+       iosync();
+
+       /* EOI it */
+       xics_phys = local_paca->kvm_hstate.xics_phys;
+       _stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+{
+       unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+       return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real mode. Handles vmalloc'ed addresses.
+ *
+ * TODO: make this work for any integral type.
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+       unsigned long l;
+       unsigned int *raddr;
+       int cpu = smp_processor_id();
+
+       raddr = per_cpu_ptr(addr, cpu);
+       l = (unsigned long)raddr;
+
+       if (REGION_ID(l) == VMALLOC_REGION_ID) {
+               l = vmalloc_to_phys(raddr);
+               raddr = (unsigned int *)l;
+       }
+       ++*raddr;
+}
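A brief usage sketch, assuming a counter obtained from alloc_percpu() (which may land in vmalloc space, the exact case this helper exists for); the names here are hypothetical:

	/* Hypothetical usage sketch: a dynamically allocated per-CPU
	 * counter (alloc_percpu() memory can be vmalloc'ed) bumped
	 * safely from real mode. */
	static unsigned int __percpu *rm_events;	/* = alloc_percpu(unsigned int) */

	static void note_rm_event(void)
	{
		this_cpu_inc_rm(rm_events);
	}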
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ *  - state flags represent internal IRQ state and are not expected to be
+ *    updated outside the IRQ subsystem
+ *  - more importantly, these are useful for edge triggered interrupts,
+ *    IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ *    and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() because that function is defined in
+ * kernel/irq/internals.h, which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per-CPU
+ * variable that could have been vmalloc'ed, so we can't directly call
+ * __this_cpu_inc() on it. The kstat structure is a static per-CPU
+ * variable and should be accessible to real-mode KVM.
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+       this_cpu_inc_rm(desc->kstat_irqs);
+       __this_cpu_inc(kstat.irqs_sum);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+                                u32 xirr,
+                                struct kvmppc_irq_map *irq_map,
+                                struct kvmppc_passthru_irqmap *pimap)
+{
+       struct kvmppc_xics *xics;
+       struct kvmppc_icp *icp;
+       u32 irq;
+
+       irq = irq_map->v_hwirq;
+       xics = vcpu->kvm->arch.xics;
+       icp = vcpu->arch.icp;
+
+       kvmppc_rm_handle_irq_desc(irq_map->desc);
+       icp_rm_deliver_irq(xics, icp, irq);
+
+       /* EOI the interrupt */
+       icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+
+       if (check_too_hard(xics, icp) == H_TOO_HARD)
+               return 2;
+       else
+               return -2;
+}
+
 /*  --- Non-real mode XICS-related built-in routines ---  */
 
 /**
index 9756555..c3c1d1b 100644 (file)
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
        li      r3, 0           /* Don't wake on privileged (OS) doorbell */
        b       kvm_do_nap
 
+/*
+ * kvm_novcpu_wakeup
+ *     Entered from kvm_start_guest if kvm_hstate.napping is set
+ *     to NAPPING_NOVCPU
+ *             r2 = kernel TOC
+ *             r13 = paca
+ */
 kvm_novcpu_wakeup:
        ld      r1, HSTATE_HOST_R1(r13)
        ld      r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
 
+       /*
+        * Restore volatile registers since we could have called
+        * a C routine in kvmppc_check_wake_reason.
+        *      r5 = VCORE
+        */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+
        /* see if any other thread is already exiting */
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
@@ -322,6 +336,11 @@ kvm_start_guest:
 
        /* Check the wake reason in SRR1 to see why we got here */
        bl      kvmppc_check_wake_reason
+       /*
+        * kvmppc_check_wake_reason could invoke a C routine, but we
+        * have no volatile registers to restore when we return.
+        */
+
        cmpdi   r3, 0
        bge     kvm_no_guest
 
@@ -625,9 +644,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
 38:
 
 BEGIN_FTR_SECTION
-       /* DPDES is shared between threads */
+       /* DPDES and VTB are shared between threads */
        ld      r8, VCORE_DPDES(r5)
+       ld      r7, VCORE_VTB(r5)
        mtspr   SPRN_DPDES, r8
+       mtspr   SPRN_VTB, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        /* Mark the subcore state as inside guest */
@@ -787,10 +808,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtspr   SPRN_CIABR, r7
        mtspr   SPRN_TAR, r8
        ld      r5, VCPU_IC(r4)
-       ld      r6, VCPU_VTB(r4)
-       mtspr   SPRN_IC, r5
-       mtspr   SPRN_VTB, r6
        ld      r8, VCPU_EBBHR(r4)
+       mtspr   SPRN_IC, r5
        mtspr   SPRN_EBBHR, r8
        ld      r5, VCPU_EBBRR(r4)
        ld      r6, VCPU_BESCR(r4)
@@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        cmpwi   r3, 512         /* 1 microsecond */
        blt     hdec_soon
 
+deliver_guest_interrupt:
        ld      r6, VCPU_CTR(r4)
        ld      r7, VCPU_XER(r4)
 
@@ -895,7 +915,6 @@ kvmppc_cede_reentry:                /* r4 = vcpu, r13 = paca */
        mtspr   SPRN_SRR0, r6
        mtspr   SPRN_SRR1, r7
 
-deliver_guest_interrupt:
        /* r11 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
@@ -1155,10 +1174,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
         * set, we know the host wants us out so let's do it now
         */
        bl      kvmppc_read_intr
+
+       /*
+        * Restore the active volatile registers after returning from
+        * a C function.
+        */
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+
+       /*
+        * kvmppc_read_intr return codes:
+        *
+        * Exit to host (r3 > 0)
+        *   1 An interrupt is pending that needs to be handled by the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *
+        *   2 Passthrough that needs completion in the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+        *     to indicate to the host to complete handling the interrupt
+        *
+        * Before returning to guest, we check if any CPU is heading out
+        * to the host and if so, we head out also. If no CPU is heading
+        * out, we fall through to the return values <= 0 below.
+        *
+        * Return to guest (r3 <= 0)
+        *  0 No external interrupt is pending
+        * -1 A guest wakeup IPI (which has now been cleared)
+        *    In either case, we return to guest to deliver any pending
+        *    guest interrupts.
+        *
+        * -2 A PCI passthrough external interrupt was handled
+        *    (interrupt was delivered directly to guest)
+        *    Return to guest to deliver any pending guest interrupts.
+        */
+
+       cmpdi   r3, 1
+       ble     1f
+
+       /* Return code = 2 */
+       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
+       stw     r12, VCPU_TRAP(r9)
+       b       guest_exit_cont
+
+1:     /* Return code <= 1 */
        cmpdi   r3, 0
        bgt     guest_exit_cont
 
-       /* Check if any CPU is heading out to the host, if so head out too */
+       /* Return code <= 0 */
 4:     ld      r5, HSTATE_KVM_VCORE(r13)
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
@@ -1271,10 +1334,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        stw     r6, VCPU_PSPB(r9)
        std     r7, VCPU_FSCR(r9)
        mfspr   r5, SPRN_IC
-       mfspr   r6, SPRN_VTB
        mfspr   r7, SPRN_TAR
        std     r5, VCPU_IC(r9)
-       std     r6, VCPU_VTB(r9)
        std     r7, VCPU_TAR(r9)
        mfspr   r8, SPRN_EBBHR
        std     r8, VCPU_EBBHR(r9)
@@ -1501,9 +1562,11 @@ kvmhv_switch_to_host:
        isync
 
 BEGIN_FTR_SECTION
-       /* DPDES is shared between threads */
+       /* DPDES and VTB are shared between threads */
        mfspr   r7, SPRN_DPDES
+       mfspr   r8, SPRN_VTB
        std     r7, VCORE_DPDES(r5)
+       std     r8, VCORE_VTB(r5)
        /* clear DPDES so we don't get guest doorbells in the host */
        li      r8, 0
        mtspr   SPRN_DPDES, r8
@@ -2213,10 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
        ld      r29, VCPU_GPR(R29)(r4)
        ld      r30, VCPU_GPR(R30)(r4)
        ld      r31, VCPU_GPR(R31)(r4)
+
        /* Check the wake reason in SRR1 to see why we got here */
        bl      kvmppc_check_wake_reason
 
+       /*
+        * Restore volatile registers since we could have called a
+        * C routine in kvmppc_check_wake_reason.
+        *      r4 = VCPU
+        * r3 tells us whether we need to return to host or not.
+        * WARNING: r3 is checked further down; do not modify it
+        * until that check is done.
+        */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+
        /* clear our bit in vcore->napping_threads */
 34:    ld      r5,HSTATE_KVM_VCORE(r13)
        lbz     r7,HSTATE_PTID(r13)
@@ -2230,7 +2303,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
        li      r0,0
        stb     r0,HSTATE_NAPPING(r13)
 
-       /* See if the wake reason means we need to exit */
+       /* See if the wake reason saved in r3 means we need to exit */
        stw     r12, VCPU_TRAP(r4)
        mr      r9, r4
        cmpdi   r3, 0
@@ -2297,10 +2370,14 @@ machine_check_realmode:
  *     0 if nothing needs to be done
  *     1 if something happened that needs to be handled by the host
  *     -1 if there was a guest wakeup (IPI or msgsnd)
+ *     -2 if we handled a PCI passthrough interrupt (returned by
+ *             kvmppc_read_intr only)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
- * Modifies r0, r6, r7, r8.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
  */
 kvmppc_check_wake_reason:
        mfspr   r6, SPRN_SRR1
@@ -2310,8 +2387,7 @@ FTR_SECTION_ELSE
        rlwinm  r6, r6, 45-31, 0xe      /* P7 wake reason field is 3 bits */
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
        cmpwi   r6, 8                   /* was it an external interrupt? */
-       li      r12, BOOK3S_INTERRUPT_EXTERNAL
-       beq     kvmppc_read_intr        /* if so, see what it was */
+       beq     7f                      /* if so, see what it was */
        li      r3, 0
        li      r12, 0
        cmpwi   r6, 6                   /* was it the decrementer? */
@@ -2350,83 +2426,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 1
        blr
 
-/*
- * Determine what sort of external interrupt is pending (if any).
- * Returns:
- *     0 if no interrupt is pending
- *     1 if an interrupt is pending that needs to be handled by the host
- *     -1 if there was a guest wakeup IPI (which has now been cleared)
- * Modifies r0, r6, r7, r8, returns value in r3.
- */
-kvmppc_read_intr:
-       /* see if a host IPI is pending */
-       li      r3, 1
-       lbz     r0, HSTATE_HOST_IPI(r13)
-       cmpwi   r0, 0
-       bne     1f
+       /* external interrupt - create a stack frame so we can call C */
+7:     mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -PPC_MIN_STKFRM(r1)
+       bl      kvmppc_read_intr
+       nop
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+       cmpdi   r3, 1
+       ble     1f
 
-       /* Now read the interrupt from the ICP */
-       ld      r6, HSTATE_XICS_PHYS(r13)
-       li      r7, XICS_XIRR
-       cmpdi   r6, 0
-       beq-    1f
-       lwzcix  r0, r6, r7
        /*
-        * Save XIRR for later. Since we get in in reverse endian on LE
-        * systems, save it byte reversed and fetch it back in host endian.
-        */
-       li      r3, HSTATE_SAVED_XIRR
-       STWX_BE r0, r3, r13
-#ifdef __LITTLE_ENDIAN__
-       lwz     r3, HSTATE_SAVED_XIRR(r13)
-#else
-       mr      r3, r0
-#endif
-       rlwinm. r3, r3, 0, 0xffffff
-       sync
-       beq     1f                      /* if nothing pending in the ICP */
-
-       /* We found something in the ICP...
-        *
-        * If it's not an IPI, stash it in the PACA and return to
-        * the host, we don't (yet) handle directing real external
-        * interrupts directly to the guest
+        * A return code of 2 means a PCI passthrough interrupt, but
+        * we need to return to the host to complete handling the
+        * interrupt. The trap reason is expected in r12 by the guest
+        * exit code.
         */
-       cmpwi   r3, XICS_IPI            /* if there is, is it an IPI? */
-       bne     42f
-
-       /* It's an IPI, clear the MFRR and EOI it */
-       li      r3, 0xff
-       li      r8, XICS_MFRR
-       stbcix  r3, r6, r8              /* clear the IPI */
-       stwcix  r0, r6, r7              /* EOI it */
-       sync
-
-       /* We need to re-check host IPI now in case it got set in the
-        * meantime. If it's clear, we bounce the interrupt to the
-        * guest
-        */
-       lbz     r0, HSTATE_HOST_IPI(r13)
-       cmpwi   r0, 0
-       bne-    43f
-
-       /* OK, it's an IPI for us */
-       li      r12, 0
-       li      r3, -1
-1:     blr
-
-42:    /* It's not an IPI and it's for the host. We saved a copy of XIRR in
-        * the PACA earlier, it will be picked up by the host ICP driver
-        */
-       li      r3, 1
-       b       1b
-
-43:    /* We raced with the host, we need to resend that IPI, bummer */
-       li      r0, IPI_PRIORITY
-       stbcix  r0, r6, r8              /* set the IPI */
-       sync
-       li      r3, 1
-       b       1b
+       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
+       ld      r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+       addi    r1, r1, PPC_MIN_STKFRM
+       mtlr    r0
+       blr
 
 /*
  * Save away FP, VMX and VSX registers.
index e76f79a..826c541 100644 (file)
@@ -226,7 +226,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
         */
        vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
        vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
-       vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
+       to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
        if (cpu_has_feature(CPU_FTR_ARCH_207S))
                vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
        svcpu->in_use = false;
@@ -448,6 +448,8 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
        case PVR_POWER7:
        case PVR_POWER7p:
        case PVR_POWER8:
+       case PVR_POWER8E:
+       case PVR_POWER8NVL:
                vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
                        BOOK3S_HFLAG_NEW_TLBIE;
                break;
@@ -1361,6 +1363,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_HIOR:
                *val = get_reg_val(id, to_book3s(vcpu)->hior);
                break;
+       case KVM_REG_PPC_VTB:
+               *val = get_reg_val(id, to_book3s(vcpu)->vtb);
+               break;
        case KVM_REG_PPC_LPCR:
        case KVM_REG_PPC_LPCR_64:
                /*
@@ -1397,6 +1402,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
                to_book3s(vcpu)->hior = set_reg_val(id, *val);
                to_book3s(vcpu)->hior_explicit = true;
                break;
+       case KVM_REG_PPC_VTB:
+               to_book3s(vcpu)->vtb = set_reg_val(id, *val);
+               break;
        case KVM_REG_PPC_LPCR:
        case KVM_REG_PPC_LPCR_64:
                kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
index 05aa113..3bdc639 100644 (file)
@@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
                return 0;
        }
 
+       /* Record which CPU this arrived on for passed-through interrupts */
+       if (state->host_irq)
+               state->intr_cpu = raw_smp_processor_id();
+
        /* Attempt delivery */
        icp_deliver_irq(xics, NULL, irq);
 
@@ -812,7 +816,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
        return H_SUCCESS;
 }
 
-static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 {
        struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
        struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -841,6 +845,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 
        return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
 
 int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 {
@@ -892,6 +897,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 /* -- Initialisation code etc. -- */
 
+static void xics_debugfs_irqmap(struct seq_file *m,
+                               struct kvmppc_passthru_irqmap *pimap)
+{
+       int i;
+
+       if (!pimap)
+               return;
+       seq_printf(m, "===========\nPIRQ mappings: %d maps\n===========\n",
+                               pimap->n_mapped);
+       for (i = 0; i < pimap->n_mapped; i++)  {
+               seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+                       pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+       }
+}
+
 static int xics_debug_show(struct seq_file *m, void *private)
 {
        struct kvmppc_xics *xics = m->private;
@@ -913,6 +933,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
        t_check_resend = 0;
        t_reject = 0;
 
+       xics_debugfs_irqmap(m, kvm->arch.pimap);
+
        seq_printf(m, "=========\nICP state\n=========\n");
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1252,6 +1274,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 {
        struct kvmppc_xics *xics = kvm->arch.xics;
 
+       if (!xics)
+               return -ENODEV;
        return ics_deliver_irq(xics, irq, level);
 }
 
@@ -1418,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
        return pin;
 }
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+                           unsigned long host_irq)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       u16 idx;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return;
+
+       ics->irq_state[idx].host_irq = host_irq;
+       ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+                           unsigned long host_irq)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       u16 idx;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return;
+
+       ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
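A hedged sketch of the intended pairing; the real call sites (in the HV passthrough map/unmap paths) are outside this hunk, and the variable names here are illustrative:

	/* Illustrative pairing of the two exports around the lifetime
	 * of a passed-through interrupt. */
	kvmppc_xics_set_mapped(kvm, guest_gsi, host_irq);	/* on map */
	/* ... interrupt is delivered directly while mapped ... */
	kvmppc_xics_clr_mapped(kvm, guest_gsi, host_irq);	/* on unmap */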
index a46b954..2a50320 100644 (file)
@@ -42,6 +42,8 @@ struct ics_irq_state {
        u8  lsi;                /* level-sensitive interrupt */
        u8  asserted; /* Only for LSI */
        u8  exists;
+       int intr_cpu;
+       u32 host_irq;
 };
 
 /* Atomic ICP state, updated with a single compare & swap */
index 02b4672..df3f270 100644 (file)
@@ -2038,7 +2038,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                if (type == KVMPPC_DEBUG_NONE)
                        continue;
 
-               if (type & !(KVMPPC_DEBUG_WATCH_READ |
+               if (type & ~(KVMPPC_DEBUG_WATCH_READ |
                             KVMPPC_DEBUG_WATCH_WRITE |
                             KVMPPC_DEBUG_BREAKPOINT))
                        return -EINVAL;
index 29911a0..ddbf8f0 100644 (file)
@@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
        char *virt;
        struct page **pages;
        struct tlbe_priv *privs[2] = {};
-       u64 *g2h_bitmap = NULL;
+       u64 *g2h_bitmap;
        size_t array_len;
        u32 sets;
        int num_pages, ret, i;
@@ -779,41 +779,44 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
        num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
                    cfg->array / PAGE_SIZE;
-       pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+       pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;
 
        ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
        if (ret < 0)
-               goto err_pages;
+               goto free_pages;
 
        if (ret != num_pages) {
                num_pages = ret;
                ret = -EFAULT;
-               goto err_put_page;
+               goto put_pages;
        }
 
        virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
        if (!virt) {
                ret = -ENOMEM;
-               goto err_put_page;
+               goto put_pages;
        }
 
-       privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
-                          GFP_KERNEL);
-       privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
-                          GFP_KERNEL);
+       privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
+       if (!privs[0]) {
+               ret = -ENOMEM;
+               goto put_pages;
+       }
 
-       if (!privs[0] || !privs[1]) {
+       privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
+       if (!privs[1]) {
                ret = -ENOMEM;
-               goto err_privs;
+               goto free_privs_first;
        }
 
-       g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
-                            GFP_KERNEL);
+       g2h_bitmap = kcalloc(params.tlb_sizes[1],
+                            sizeof(*g2h_bitmap),
+                            GFP_KERNEL);
        if (!g2h_bitmap) {
                ret = -ENOMEM;
-               goto err_privs;
+               goto free_privs_second;
        }
 
        free_gtlb(vcpu_e500);
@@ -845,16 +848,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
        kvmppc_recalc_tlb1map_range(vcpu_e500);
        return 0;
-
-err_privs:
-       kfree(privs[0]);
+ free_privs_second:
        kfree(privs[1]);
-
-err_put_page:
+ free_privs_first:
+       kfree(privs[0]);
+ put_pages:
        for (i = 0; i < num_pages; i++)
                put_page(pages[i]);
-
-err_pages:
+ free_pages:
        kfree(pages);
        return ret;
 }
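The relabelled error paths above follow the usual label-per-resource unwind idiom. A minimal self-contained sketch of the pattern (not code from this patch):

	static int example_unwind(void)
	{
		int *a, *b, ret;

		a = kcalloc(4, sizeof(*a), GFP_KERNEL);
		if (!a)
			return -ENOMEM;

		b = kcalloc(4, sizeof(*b), GFP_KERNEL);
		if (!b) {
			ret = -ENOMEM;
			goto free_a;
		}

		/* ... on success, a and b are handed off to the caller ... */
		return 0;

	free_a:
		kfree(a);
		return ret;
	}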
@@ -904,11 +905,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
        struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
-       int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
-       int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
 
        if (e500_mmu_host_init(vcpu_e500))
-               goto err;
+               goto free_vcpu;
 
        vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
        vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -920,37 +919,39 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
        vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
        vcpu_e500->gtlb_params[1].sets = 1;
 
-       vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+       vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+                                            KVM_E500_TLB1_SIZE,
+                                            sizeof(*vcpu_e500->gtlb_arch),
+                                            GFP_KERNEL);
        if (!vcpu_e500->gtlb_arch)
                return -ENOMEM;
 
        vcpu_e500->gtlb_offset[0] = 0;
        vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
 
-       vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
-                                         vcpu_e500->gtlb_params[0].entries,
+       vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+                                         sizeof(struct tlbe_ref),
                                          GFP_KERNEL);
        if (!vcpu_e500->gtlb_priv[0])
-               goto err;
+               goto free_vcpu;
 
-       vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
-                                         vcpu_e500->gtlb_params[1].entries,
+       vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+                                         sizeof(struct tlbe_ref),
                                          GFP_KERNEL);
        if (!vcpu_e500->gtlb_priv[1])
-               goto err;
+               goto free_vcpu;
 
-       vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
-                                         vcpu_e500->gtlb_params[1].entries,
+       vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+                                         sizeof(*vcpu_e500->g2h_tlb1_map),
                                          GFP_KERNEL);
        if (!vcpu_e500->g2h_tlb1_map)
-               goto err;
+               goto free_vcpu;
 
        vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
 
        kvmppc_recalc_tlb1map_range(vcpu_e500);
        return 0;
-
-err:
+ free_vcpu:
        free_gtlb(vcpu_e500);
        return -1;
 }
index 6ce40dd..70963c8 100644 (file)
@@ -27,6 +27,8 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -436,6 +438,16 @@ err_out:
        return -EINVAL;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        unsigned int i;
@@ -739,6 +751,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+       return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+               (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+                                    struct irq_bypass_producer *prod)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+       struct kvm *kvm = irqfd->kvm;
+
+       if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+               return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+       return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+                                     struct irq_bypass_producer *prod)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+       struct kvm *kvm = irqfd->kvm;
+
+       if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+               kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
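For context, a hedged sketch of the producer side that triggers these callbacks (in practice VFIO registers the producer; the names here are hypothetical). The token must match the irqfd consumer's eventfd for the bypass manager to connect the two:

	/* Hypothetical producer registration sketch. */
	static struct irq_bypass_producer example_prod;

	static int example_register(struct eventfd_ctx *trigger, int host_irq)
	{
		example_prod.token = trigger;	/* must match the irqfd consumer token */
		example_prod.irq = host_irq;
		return irq_bypass_register_producer(&example_prod);
	}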
+
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
@@ -1167,6 +1215,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
        return r;
 }
 
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+       if (kvm->arch.mpic)
+               return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+       if (kvm->arch.xics)
+               return true;
+#endif
+       return false;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
 {
index 33d9daf..fb21990 100644 (file)
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
                   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
 );
 
+TRACE_EVENT(kvmppc_vcore_wakeup,
+       TP_PROTO(int do_sleep, __u64 ns),
+
+       TP_ARGS(do_sleep, ns),
+
+       TP_STRUCT__entry(
+               __field(__u64,  ns)
+               __field(int,    waited)
+               __field(pid_t,  tgid)
+       ),
+
+       TP_fast_assign(
+               __entry->ns     = ns;
+               __entry->waited = do_sleep;
+               __entry->tgid   = current->tgid;
+       ),
+
+       TP_printk("%s time %lld ns, tgid=%d",
+               __entry->waited ? "wait" : "poll",
+               __entry->ns, __entry->tgid)
+);
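As with any TRACE_EVENT, call sites use the generated trace_kvmppc_vcore_wakeup(); a hedged call-site sketch (variable names hypothetical):

	/* Hypothetical call site: report whether the vcore waited or
	 * polled, and for how long. */
	trace_kvmppc_vcore_wakeup(do_sleep,
				  ktime_to_ns(cur) - ktime_to_ns(start_wait));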
+
 TRACE_EVENT(kvmppc_run_vcpu_enter,
        TP_PROTO(struct kvm_vcpu *vcpu),
 
index 0e4e965..83ddc0e 100644 (file)
@@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid,
 }
 #endif
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                        int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                size   = MMU_PAGE_4K;
                a_size = MMU_PAGE_4K;
        } else {
-               for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[size].shift)
-                               continue;
-
-                       a_size = __hpte_actual_psize(lp, size);
-                       if (a_size != -1)
-                               break;
-               }
+               size = hpte_page_sizes[lp] & 0xf;
+               a_size = hpte_page_sizes[lp] >> 4;
        }
        /* This works for all page sizes, and for 256M and 1T segments */
        if (cpu_has_feature(CPU_FTR_ARCH_300))
index 0821556..ef3ae89 100644 (file)
@@ -93,6 +93,9 @@ static unsigned long _SDR1;
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 EXPORT_SYMBOL_GPL(mmu_psize_defs);
 
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
@@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void)
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations.  Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE.  For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ *        PTE LP     actual page size
+ *    rrrr rrrz                >=8KB
+ *    rrrr rrzz                >=16KB
+ *    rrrr rzzz                >=32KB
+ *    rrrr zzzz                >=64KB
+ *    ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+       long int ap, bp;
+       long int shift, penc;
+
+       for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+               if (!mmu_psize_defs[bp].shift)
+                       continue;       /* not a supported page size */
+               for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+                       penc = mmu_psize_defs[bp].penc[ap];
+                       if (penc == -1)
+                               continue;
+                       shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+                       if (shift <= 0)
+                               continue;       /* should never happen */
+                       /*
+                        * For page sizes less than 1MB, this loop
+                        * replicates the entry for all possible values
+                        * of the rrrr bits.
+                        */
+                       while (penc < (1 << LP_BITS)) {
+                               hpte_page_sizes[penc] = (ap << 4) | bp;
+                               penc += 1 << shift;
+                       }
+               }
+       }
+}
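The packing convention matches the consumer in hpte_decode() earlier in this series: base page size index in the low nibble, actual page size index in the high nibble. A small decoding sketch (helper name hypothetical):

	/* Sketch: recover (base, actual) page-size indices from an LP
	 * value, mirroring what hpte_decode() does. */
	static inline void example_decode_lp(unsigned int lp, int *bp, int *ap)
	{
		u8 e = hpte_page_sizes[lp & ((1 << LP_BITS) - 1)];

		*bp = e & 0xf;	/* base page size index */
		*ap = e >> 4;	/* actual page size index */
	}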
+
 static void __init htab_init_page_sizes(void)
 {
+       init_hpte_page_sizes();
+
        if (!debug_pagealloc_enabled()) {
                /*
                 * Pick a size for the linear mapping. Currently, we only
index 3d29d40..44d2d84 100644 (file)
@@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte,               OPAL_PCI_CONFIG_WRITE_BYTE);
 OPAL_CALL(opal_pci_config_write_half_word,     OPAL_PCI_CONFIG_WRITE_HALF_WORD);
 OPAL_CALL(opal_pci_config_write_word,          OPAL_PCI_CONFIG_WRITE_WORD);
 OPAL_CALL(opal_set_xive,                       OPAL_SET_XIVE);
+OPAL_CALL_REAL(opal_rm_set_xive,               OPAL_SET_XIVE);
 OPAL_CALL(opal_get_xive,                       OPAL_GET_XIVE);
 OPAL_CALL(opal_register_exception_handler,     OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
 OPAL_CALL(opal_pci_eeh_freeze_status,          OPAL_PCI_EEH_FREEZE_STATUS);
index 38a5c65..d314ecc 100644 (file)
@@ -2718,15 +2718,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 }
 
 #ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
-       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-       struct irq_chip *chip = irq_data_get_irq_chip(d);
        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
                                           ioda.irq_chip);
+
+       return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
        int64_t rc;
+       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+       struct irq_chip *chip = irq_data_get_irq_chip(d);
 
-       rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+       rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
        WARN_ON_ONCE(rc);
 
        icp_native_eoi(d);
@@ -2756,6 +2762,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
        irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+       return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
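A hedged caller sketch: gate the OPAL EOI on the chip check, which is how the export is meant to be consumed (e.g. by the KVM passthrough setup code):

	/* Hypothetical caller sketch. */
	struct irq_chip *chip = irq_data_get_irq_chip(irq_get_irq_data(host_irq));

	if (chip && is_pnv_opal_msi(chip))
		rc = pnv_opal_pci_msi_eoi(chip, hw_irq);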
+
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
                                  unsigned int hwirq, unsigned int virq,
                                  unsigned int is_64, struct msi_msg *msg)
index 8e5daf7..a41faf3 100644 (file)
@@ -28,7 +28,7 @@
 
 #define KVM_S390_BSCA_CPU_SLOTS 64
 #define KVM_S390_ESCA_CPU_SLOTS 248
-#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS
+#define KVM_MAX_VCPUS 255
 #define KVM_USER_MEM_SLOTS 32
 
 /*
@@ -245,72 +245,72 @@ struct sie_page {
 } __packed;
 
 struct kvm_vcpu_stat {
-       u32 exit_userspace;
-       u32 exit_null;
-       u32 exit_external_request;
-       u32 exit_external_interrupt;
-       u32 exit_stop_request;
-       u32 exit_validity;
-       u32 exit_instruction;
-       u32 exit_pei;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 instruction_lctl;
-       u32 instruction_lctlg;
-       u32 instruction_stctl;
-       u32 instruction_stctg;
-       u32 exit_program_interruption;
-       u32 exit_instr_and_program;
-       u32 exit_operation_exception;
-       u32 deliver_external_call;
-       u32 deliver_emergency_signal;
-       u32 deliver_service_signal;
-       u32 deliver_virtio_interrupt;
-       u32 deliver_stop_signal;
-       u32 deliver_prefix_signal;
-       u32 deliver_restart_signal;
-       u32 deliver_program_int;
-       u32 deliver_io_int;
-       u32 exit_wait_state;
-       u32 instruction_pfmf;
-       u32 instruction_stidp;
-       u32 instruction_spx;
-       u32 instruction_stpx;
-       u32 instruction_stap;
-       u32 instruction_storage_key;
-       u32 instruction_ipte_interlock;
-       u32 instruction_stsch;
-       u32 instruction_chsc;
-       u32 instruction_stsi;
-       u32 instruction_stfl;
-       u32 instruction_tprot;
-       u32 instruction_sie;
-       u32 instruction_essa;
-       u32 instruction_sthyi;
-       u32 instruction_sigp_sense;
-       u32 instruction_sigp_sense_running;
-       u32 instruction_sigp_external_call;
-       u32 instruction_sigp_emergency;
-       u32 instruction_sigp_cond_emergency;
-       u32 instruction_sigp_start;
-       u32 instruction_sigp_stop;
-       u32 instruction_sigp_stop_store_status;
-       u32 instruction_sigp_store_status;
-       u32 instruction_sigp_store_adtl_status;
-       u32 instruction_sigp_arch;
-       u32 instruction_sigp_prefix;
-       u32 instruction_sigp_restart;
-       u32 instruction_sigp_init_cpu_reset;
-       u32 instruction_sigp_cpu_reset;
-       u32 instruction_sigp_unknown;
-       u32 diagnose_10;
-       u32 diagnose_44;
-       u32 diagnose_9c;
-       u32 diagnose_258;
-       u32 diagnose_308;
-       u32 diagnose_500;
+       u64 exit_userspace;
+       u64 exit_null;
+       u64 exit_external_request;
+       u64 exit_external_interrupt;
+       u64 exit_stop_request;
+       u64 exit_validity;
+       u64 exit_instruction;
+       u64 exit_pei;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 instruction_lctl;
+       u64 instruction_lctlg;
+       u64 instruction_stctl;
+       u64 instruction_stctg;
+       u64 exit_program_interruption;
+       u64 exit_instr_and_program;
+       u64 exit_operation_exception;
+       u64 deliver_external_call;
+       u64 deliver_emergency_signal;
+       u64 deliver_service_signal;
+       u64 deliver_virtio_interrupt;
+       u64 deliver_stop_signal;
+       u64 deliver_prefix_signal;
+       u64 deliver_restart_signal;
+       u64 deliver_program_int;
+       u64 deliver_io_int;
+       u64 exit_wait_state;
+       u64 instruction_pfmf;
+       u64 instruction_stidp;
+       u64 instruction_spx;
+       u64 instruction_stpx;
+       u64 instruction_stap;
+       u64 instruction_storage_key;
+       u64 instruction_ipte_interlock;
+       u64 instruction_stsch;
+       u64 instruction_chsc;
+       u64 instruction_stsi;
+       u64 instruction_stfl;
+       u64 instruction_tprot;
+       u64 instruction_sie;
+       u64 instruction_essa;
+       u64 instruction_sthyi;
+       u64 instruction_sigp_sense;
+       u64 instruction_sigp_sense_running;
+       u64 instruction_sigp_external_call;
+       u64 instruction_sigp_emergency;
+       u64 instruction_sigp_cond_emergency;
+       u64 instruction_sigp_start;
+       u64 instruction_sigp_stop;
+       u64 instruction_sigp_stop_store_status;
+       u64 instruction_sigp_store_status;
+       u64 instruction_sigp_store_adtl_status;
+       u64 instruction_sigp_arch;
+       u64 instruction_sigp_prefix;
+       u64 instruction_sigp_restart;
+       u64 instruction_sigp_init_cpu_reset;
+       u64 instruction_sigp_cpu_reset;
+       u64 instruction_sigp_unknown;
+       u64 diagnose_10;
+       u64 diagnose_44;
+       u64 diagnose_9c;
+       u64 diagnose_258;
+       u64 diagnose_308;
+       u64 diagnose_500;
 };
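The widening is not cosmetic: a u32 counter wraps at 2^32 ≈ 4.3 billion events, so at a sustained 100,000 exits per second it overflows in about 2^32 / 10^5 ≈ 43,000 seconds, roughly half a day; u64 makes wraparound a non-issue for these debugfs stats.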
 
 #define PGM_OPERATION                  0x01
@@ -577,7 +577,7 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_arch_memory_slot {
index 1f95cc1..f3df9e0 100644 (file)
@@ -125,6 +125,7 @@ int main(void)
        OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
        OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
        OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
+       OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
        OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
        OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
        OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
index 5420020..4aa8a7e 100644 (file)
@@ -495,6 +495,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
        tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
 
        switch (code) {
+       case PGM_PROTECTION:
+               switch (prot) {
+               case PROT_TYPE_ALC:
+                       tec->b60 = 1;
+                       /* FALL THROUGH */
+               case PROT_TYPE_DAT:
+                       tec->b61 = 1;
+                       break;
+               default: /* LA and KEYC set b61 to 0, other params undefined */
+                       return code;
+               }
+               /* FALL THROUGH */
        case PGM_ASCE_TYPE:
        case PGM_PAGE_TRANSLATION:
        case PGM_REGION_FIRST_TRANS:
@@ -504,8 +516,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
                /*
                 * op_access_id only applies to MOVE_PAGE -> set bit 61
                 * exc_access_id has to be set to 0 for some instructions. Both
-                * cases have to be handled by the caller. We can always store
-                * exc_access_id, as it is undefined for non-ar cases.
+                * cases have to be handled by the caller.
                 */
                tec->addr = gva >> PAGE_SHIFT;
                tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
@@ -516,25 +527,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
        case PGM_ASTE_VALIDITY:
        case PGM_ASTE_SEQUENCE:
        case PGM_EXTENDED_AUTHORITY:
+               /*
+                * We can always store exc_access_id, as it is
+                * undefined for non-ar cases. It is undefined for
+                * most DAT protection exceptions.
+                */
                pgm->exc_access_id = ar;
                break;
-       case PGM_PROTECTION:
-               switch (prot) {
-               case PROT_TYPE_ALC:
-                       tec->b60 = 1;
-                       /* FALL THROUGH */
-               case PROT_TYPE_DAT:
-                       tec->b61 = 1;
-                       tec->addr = gva >> PAGE_SHIFT;
-                       tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
-                       tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
-                       /* exc_access_id is undefined for most cases */
-                       pgm->exc_access_id = ar;
-                       break;
-               default: /* LA and KEYC set b61 to 0, other params undefined */
-                       break;
-               }
-               break;
        }
        return code;
 }
index 31a0533..d7c6a7f 100644 (file)
@@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
 int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
                            struct kvm_guest_debug *dbg)
 {
-       int ret = 0, nr_wp = 0, nr_bp = 0, i, size;
+       int ret = 0, nr_wp = 0, nr_bp = 0, i;
        struct kvm_hw_breakpoint *bp_data = NULL;
        struct kvm_hw_wp_info_arch *wp_info = NULL;
        struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -216,17 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
        else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
                return -EINVAL;
 
-       size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint);
-       bp_data = kmalloc(size, GFP_KERNEL);
-       if (!bp_data) {
-               ret = -ENOMEM;
-               goto error;
-       }
-
-       if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) {
-               ret = -EFAULT;
-               goto error;
-       }
+       bp_data = memdup_user(dbg->arch.hw_bp,
+                             sizeof(*bp_data) * dbg->arch.nr_hw_bp);
+       if (IS_ERR(bp_data))
+               return PTR_ERR(bp_data);
 
        for (i = 0; i < dbg->arch.nr_hw_bp; i++) {
                switch (bp_data[i].type) {
@@ -241,17 +234,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
                }
        }
 
-       size = nr_wp * sizeof(struct kvm_hw_wp_info_arch);
-       if (size > 0) {
-               wp_info = kmalloc(size, GFP_KERNEL);
+       if (nr_wp > 0) {
+               wp_info = kmalloc_array(nr_wp,
+                                       sizeof(*wp_info),
+                                       GFP_KERNEL);
                if (!wp_info) {
                        ret = -ENOMEM;
                        goto error;
                }
        }
-       size = nr_bp * sizeof(struct kvm_hw_bp_info_arch);
-       if (size > 0) {
-               bp_info = kmalloc(size, GFP_KERNEL);
+       if (nr_bp > 0) {
+               bp_info = kmalloc_array(nr_bp,
+                                       sizeof(*bp_info),
+                                       GFP_KERNEL);
                if (!bp_info) {
                        ret = -ENOMEM;
                        goto error;
@@ -382,14 +377,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
        vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
 }
 
+#define PER_CODE_MASK          (PER_EVENT_MASK >> 24)
+#define PER_CODE_BRANCH                (PER_EVENT_BRANCH >> 24)
+#define PER_CODE_IFETCH                (PER_EVENT_IFETCH >> 24)
+#define PER_CODE_STORE         (PER_EVENT_STORE >> 24)
+#define PER_CODE_STORE_REAL    (PER_EVENT_STORE_REAL >> 24)
+
 #define per_bp_event(code) \
-                       (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH))
+                       (code & (PER_CODE_IFETCH | PER_CODE_BRANCH))
 #define per_write_wp_event(code) \
-                       (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL))
+                       (code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
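The PER_CODE_* values are simply the 32-bit PER_EVENT_* control-register masks shifted down into the 8-bit SIE perc field; a hedged sanity-check sketch (helper name hypothetical, would sit inside any function scope):

	static inline void example_per_code_sanity(void)
	{
		/* The 8-bit codes mirror the event masks shifted by 24. */
		BUILD_BUG_ON(PER_CODE_IFETCH != (PER_EVENT_IFETCH >> 24));
		BUILD_BUG_ON(PER_CODE_BRANCH != (PER_EVENT_BRANCH >> 24));
	}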
 
 static int debug_exit_required(struct kvm_vcpu *vcpu)
 {
-       u32 perc = (vcpu->arch.sie_block->perc << 24);
+       u8 perc = vcpu->arch.sie_block->perc;
        struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
        struct kvm_hw_wp_info_arch *wp_info = NULL;
        struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -444,7 +445,7 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
        const u8 ilen = kvm_s390_get_ilen(vcpu);
        struct kvm_s390_pgm_info pgm_info = {
                .code = PGM_PER,
-               .per_code = PER_EVENT_IFETCH >> 24,
+               .per_code = PER_CODE_IFETCH,
                .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
        };
 
@@ -458,33 +459,33 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
 
 static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 {
-       u32 perc = vcpu->arch.sie_block->perc << 24;
+       const u8 perc = vcpu->arch.sie_block->perc;
        u64 peraddr = vcpu->arch.sie_block->peraddr;
        u64 addr = vcpu->arch.sie_block->gpsw.addr;
        u64 cr9 = vcpu->arch.sie_block->gcr[9];
        u64 cr10 = vcpu->arch.sie_block->gcr[10];
        u64 cr11 = vcpu->arch.sie_block->gcr[11];
        /* filter all events, demanded by the guest */
-       u32 guest_perc = perc & cr9 & PER_EVENT_MASK;
+       u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
 
        if (!guest_per_enabled(vcpu))
                guest_perc = 0;
 
        /* filter "successful-branching" events */
-       if (guest_perc & PER_EVENT_BRANCH &&
+       if (guest_perc & PER_CODE_BRANCH &&
            cr9 & PER_CONTROL_BRANCH_ADDRESS &&
            !in_addr_range(addr, cr10, cr11))
-               guest_perc &= ~PER_EVENT_BRANCH;
+               guest_perc &= ~PER_CODE_BRANCH;
 
        /* filter "instruction-fetching" events */
-       if (guest_perc & PER_EVENT_IFETCH &&
+       if (guest_perc & PER_CODE_IFETCH &&
            !in_addr_range(peraddr, cr10, cr11))
-               guest_perc &= ~PER_EVENT_IFETCH;
+               guest_perc &= ~PER_CODE_IFETCH;
 
        /* All other PER events will be given to the guest */
        /* TODO: Check altered address/address space */
 
-       vcpu->arch.sie_block->perc = guest_perc >> 24;
+       vcpu->arch.sie_block->perc = guest_perc;
 
        if (!guest_perc)
                vcpu->arch.sie_block->iprcc &= ~PGM_PER;
index dfd0ca2..1cab8a1 100644 (file)
@@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = {
        [0x01] = kvm_s390_handle_01,
        [0x82] = kvm_s390_handle_lpsw,
        [0x83] = kvm_s390_handle_diag,
+       [0xaa] = kvm_s390_handle_aa,
        [0xae] = kvm_s390_handle_sigp,
        [0xb2] = kvm_s390_handle_b2,
        [0xb6] = kvm_s390_handle_stctl,
index 24524c0..be4db07 100644 (file)
@@ -24,6 +24,8 @@
 #include <asm/sclp.h>
 #include <asm/isc.h>
 #include <asm/gmap.h>
+#include <asm/switch_to.h>
+#include <asm/nmi.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "trace-s390.h"
@@ -40,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
        if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
                return 0;
 
+       BUG_ON(!kvm_s390_use_sca_entries());
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -68,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
 {
        int expect, rc;
 
+       BUG_ON(!kvm_s390_use_sca_entries());
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -109,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        int rc, expect;
 
+       if (!kvm_s390_use_sca_entries())
+               return;
        atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags);
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
@@ -400,12 +406,78 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu)
        return rc ? -EFAULT : 0;
 }
 
+static int __write_machine_check(struct kvm_vcpu *vcpu,
+                                struct kvm_s390_mchk_info *mchk)
+{
+       unsigned long ext_sa_addr;
+       freg_t fprs[NUM_FPRS];
+       union mci mci;
+       int rc;
+
+       mci.val = mchk->mcic;
+       /* take care of lazy register loading via vcpu load/put */
+       save_fpu_regs();
+       save_access_regs(vcpu->run->s.regs.acrs);
+
+       /* Extended save area */
+       rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr,
+                           sizeof(unsigned long));
+       /* Only bits 0-53 are used for address formation */
+       ext_sa_addr &= ~0x3ffUL;
+       if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) {
+               if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs,
+                                   512))
+                       mci.vr = 0;
+       } else {
+               mci.vr = 0;
+       }
+
+       /* General interruption information */
+       rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID);
+       rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
+                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
+                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE);
+
+       /* Register-save areas */
+       if (MACHINE_HAS_VX) {
+               convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
+               rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128);
+       } else {
+               rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA,
+                                    vcpu->run->s.regs.fprs, 128);
+       }
+       rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA,
+                            vcpu->run->s.regs.gprs, 128);
+       rc |= put_guest_lc(vcpu, current->thread.fpu.fpc,
+                          (u32 __user *) __LC_FP_CREG_SAVE_AREA);
+       rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr,
+                          (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA);
+       rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu),
+                          (u64 __user *) __LC_CPU_TIMER_SAVE_AREA);
+       rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8,
+                          (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA);
+       rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA,
+                            &vcpu->run->s.regs.acrs, 64);
+       rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA,
+                            &vcpu->arch.sie_block->gcr, 128);
+
+       /* Extended interruption information */
+       rc |= put_guest_lc(vcpu, mchk->ext_damage_code,
+                          (u32 __user *) __LC_EXT_DAMAGE_CODE);
+       rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
+                          (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
+       rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout,
+                            sizeof(mchk->fixed_logout));
+       return rc ? -EFAULT : 0;
+}
+
 static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_mchk_info mchk = {};
-       unsigned long adtl_status_addr;
        int deliver = 0;
        int rc = 0;
 
@@ -446,29 +518,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
                                                 KVM_S390_MCHK,
                                                 mchk.cr14, mchk.mcic);
-
-               rc  = kvm_s390_vcpu_store_status(vcpu,
-                                                KVM_S390_STORE_STATUS_PREFIXED);
-               rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR,
-                                   &adtl_status_addr,
-                                   sizeof(unsigned long));
-               rc |= kvm_s390_vcpu_store_adtl_status(vcpu,
-                                                     adtl_status_addr);
-               rc |= put_guest_lc(vcpu, mchk.mcic,
-                                  (u64 __user *) __LC_MCCK_CODE);
-               rc |= put_guest_lc(vcpu, mchk.failing_storage_address,
-                                  (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
-               rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
-                                    &mchk.fixed_logout,
-                                    sizeof(mchk.fixed_logout));
-               rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
-                                    &vcpu->arch.sie_block->gpsw,
-                                    sizeof(psw_t));
-               rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
-                                   &vcpu->arch.sie_block->gpsw,
-                                   sizeof(psw_t));
+               rc = __write_machine_check(vcpu, &mchk);
        }
-       return rc ? -EFAULT : 0;
+       return rc;
 }
 
 static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
index 7e8cb6a..9c7a1ec 100644 (file)
@@ -384,7 +384,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
                r = KVM_S390_BSCA_CPU_SLOTS;
-               if (sclp.has_esca && sclp.has_64bscao)
+               if (!kvm_s390_use_sca_entries())
+                       r = KVM_MAX_VCPUS;
+               else if (sclp.has_esca && sclp.has_64bscao)
                        r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
@@ -1498,6 +1500,16 @@ out_err:
        return rc;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
@@ -1561,6 +1573,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
 
 static void sca_del_vcpu(struct kvm_vcpu *vcpu)
 {
+       if (!kvm_s390_use_sca_entries())
+               return;
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1578,6 +1592,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
 
 static void sca_add_vcpu(struct kvm_vcpu *vcpu)
 {
+       if (!kvm_s390_use_sca_entries()) {
+               struct bsca_block *sca = vcpu->kvm->arch.sca;
+
+               /* we still need the basic sca for the ipte control */
+               vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
+               vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+       }
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1658,6 +1679,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
 {
        int rc;
 
+       if (!kvm_s390_use_sca_entries()) {
+               if (id < KVM_MAX_VCPUS)
+                       return true;
+               return false;
+       }
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
        if (!sclp.has_esca || !sclp.has_64bscao)
@@ -1946,8 +1972,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
                vcpu->arch.sie_block->eca |= 0x10000000U;
-       if (test_kvm_facility(vcpu->kvm, 64))
-               vcpu->arch.sie_block->ecb3 |= 0x01;
        if (test_kvm_facility(vcpu->kvm, 129)) {
                vcpu->arch.sie_block->eca |= 0x00020000;
                vcpu->arch.sie_block->ecd |= 0x20000000;
@@ -2704,6 +2728,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
                        kvm_clear_async_pf_completion_queue(vcpu);
        }
+       /*
+        * If userspace sets the riccb (e.g. after migration) to a valid state,
+        * we should enable RI here instead of doing the lazy enablement.
+        */
+       if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
+           test_kvm_facility(vcpu->kvm, 64)) {
+               struct runtime_instr_cb *riccb =
+                       (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+
+               if (riccb->valid)
+                       vcpu->arch.sie_block->ecb3 |= 0x01;
+       }
+
        kvm_run->kvm_dirty_regs = 0;
 }
 
@@ -2847,38 +2884,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
        return kvm_s390_store_status_unloaded(vcpu, addr);
 }
 
-/*
- * store additional status at address
- */
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
-                                       unsigned long gpa)
-{
-       /* Only bits 0-53 are used for address formation */
-       if (!(gpa & ~0x3ff))
-               return 0;
-
-       return write_guest_abs(vcpu, gpa & ~0x3ff,
-                              (void *)&vcpu->run->s.regs.vrs, 512);
-}
-
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-       if (!test_kvm_facility(vcpu->kvm, 129))
-               return 0;
-
-       /*
-        * The guest VXRS are in the host VXRs due to the lazy
-        * copying in vcpu load/put. We can simply call save_fpu_regs()
-        * to save the current register state because we are in the
-        * middle of a load/put cycle.
-        *
-        * Let's update our copies before we save it into the save area.
-        */
-       save_fpu_regs();
-
-       return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
-}
-
 static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
        kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
index b843286..3a4e97f 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/kvm_host.h>
 #include <asm/facility.h>
 #include <asm/processor.h>
+#include <asm/sclp.h>
 
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
@@ -245,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
 
 /* implemented in priv.c */
 int is_valid_psw(psw_t *psw);
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
@@ -273,10 +275,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu);
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
-                                       unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
@@ -389,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
 
        return &sca->ipte_control;
 }
+static inline int kvm_s390_use_sca_entries(void)
+{
+       /*
+        * Without SIGP interpretation, only SRS interpretation (if available)
+        * might use the entries. By not setting the entries and keeping them
+        * invalid, the hardware will not access them but will intercept instead.
+        */
+       return sclp.has_sigpif;
+}
 #endif
index 4616038..e184353 100644 (file)
 #include "kvm-s390.h"
 #include "trace.h"
 
+static int handle_ri(struct kvm_vcpu *vcpu)
+{
+       if (test_kvm_facility(vcpu->kvm, 64)) {
+               vcpu->arch.sie_block->ecb3 |= 0x01;
+               kvm_s390_retry_instr(vcpu);
+               return 0;
+       } else {
+               return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+       }
+}
+
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
+{
+       if ((vcpu->arch.sie_block->ipa & 0xf) <= 4)
+               return handle_ri(vcpu);
+       else
+               return -EOPNOTSUPP;
+}
+
 /* Handle SCK (SET CLOCK) interception */
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
@@ -1093,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
 static const intercept_handler_t eb_handlers[256] = {
        [0x2f] = handle_lctlg,
        [0x25] = handle_stctg,
+       [0x60] = handle_ri,
+       [0x61] = handle_ri,
+       [0x62] = handle_ri,
 };
 
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config
deleted file mode 100644 (file)
index 9906505..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-CONFIG_NET=y
-CONFIG_NET_CORE=y
-CONFIG_NETDEVICES=y
-CONFIG_BLOCK=y
-CONFIG_BLK_DEV=y
-CONFIG_NETWORK_FILESYSTEMS=y
-CONFIG_INET=y
-CONFIG_TTY=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_BINFMT_ELF=y
-CONFIG_PCI=y
-CONFIG_PCI_MSI=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_VIRTUALIZATION=y
-CONFIG_HYPERVISOR_GUEST=y
-CONFIG_PARAVIRT=y
-CONFIG_KVM_GUEST=y
-CONFIG_VIRTIO=y
-CONFIG_VIRTIO_PCI=y
-CONFIG_VIRTIO_BLK=y
-CONFIG_VIRTIO_CONSOLE=y
-CONFIG_VIRTIO_NET=y
-CONFIG_9P_FS=y
-CONFIG_NET_9P=y
-CONFIG_NET_9P_VIRTIO=y
-CONFIG_SCSI_LOWLEVEL=y
-CONFIG_SCSI_VIRTIO=y
-CONFIG_VIRTIO_INPUT=y
index 94d54d0..02223cb 100644 (file)
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
                        return 0;
                }
 
-               ret = __pvclock_read_cycles(pvti);
+               ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
        } while (pvclock_read_retry(pvti, version));
 
        /* refer to vread_tsc() comment for rationale */
index 33ae3a4..4b20f73 100644 (file)
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
                struct kvm_steal_time steal;
        } st;
 
+       u64 tsc_offset;
        u64 last_guest_tsc;
        u64 last_host_tsc;
        u64 tsc_offset_adjustment;
@@ -701,6 +702,8 @@ struct kvm_hv {
        /* Hyper-v based guest crash (NT kernel bugcheck) parameters */
        u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
        u64 hv_crash_ctl;
+
+       HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
@@ -781,54 +784,56 @@ struct kvm_arch {
        bool disabled_lapic_found;
 
        /* Struct members for AVIC */
+       u32 avic_vm_id;
        u32 ldr_mode;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
+       struct hlist_node hnode;
 
        bool x2apic_format;
        bool x2apic_broadcast_quirk_disabled;
 };
 
 struct kvm_vm_stat {
-       u32 mmu_shadow_zapped;
-       u32 mmu_pte_write;
-       u32 mmu_pte_updated;
-       u32 mmu_pde_zapped;
-       u32 mmu_flooded;
-       u32 mmu_recycled;
-       u32 mmu_cache_miss;
-       u32 mmu_unsync;
-       u32 remote_tlb_flush;
-       u32 lpages;
+       ulong mmu_shadow_zapped;
+       ulong mmu_pte_write;
+       ulong mmu_pte_updated;
+       ulong mmu_pde_zapped;
+       ulong mmu_flooded;
+       ulong mmu_recycled;
+       ulong mmu_cache_miss;
+       ulong mmu_unsync;
+       ulong remote_tlb_flush;
+       ulong lpages;
 };
 
 struct kvm_vcpu_stat {
-       u32 pf_fixed;
-       u32 pf_guest;
-       u32 tlb_flush;
-       u32 invlpg;
-
-       u32 exits;
-       u32 io_exits;
-       u32 mmio_exits;
-       u32 signal_exits;
-       u32 irq_window_exits;
-       u32 nmi_window_exits;
-       u32 halt_exits;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 request_irq_exits;
-       u32 irq_exits;
-       u32 host_state_reload;
-       u32 efer_reload;
-       u32 fpu_reload;
-       u32 insn_emulation;
-       u32 insn_emulation_fail;
-       u32 hypercalls;
-       u32 irq_injections;
-       u32 nmi_injections;
+       u64 pf_fixed;
+       u64 pf_guest;
+       u64 tlb_flush;
+       u64 invlpg;
+
+       u64 exits;
+       u64 io_exits;
+       u64 mmio_exits;
+       u64 signal_exits;
+       u64 irq_window_exits;
+       u64 nmi_window_exits;
+       u64 halt_exits;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 request_irq_exits;
+       u64 irq_exits;
+       u64 host_state_reload;
+       u64 efer_reload;
+       u64 fpu_reload;
+       u64 insn_emulation;
+       u64 insn_emulation_fail;
+       u64 hypercalls;
+       u64 irq_injections;
+       u64 nmi_injections;
 };
 
 struct x86_instruction_info;
@@ -951,7 +956,6 @@ struct kvm_x86_ops {
 
        bool (*has_wbinvd_exit)(void);
 
-       u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
        u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
index d019f0c..3ad741b 100644 (file)
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+                             u64 tsc)
 {
-       u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+       u64 delta = tsc - src->tsc_timestamp;
        cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
                                             src->tsc_shift);
        return src->system_time + offset;
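
For reference, once the TSC sample is hoisted to the callers, what remains in
__pvclock_read_cycles() is pure arithmetic on the pvclock fields. A minimal
standalone sketch of that computation (illustrative only: the struct is a
subset of pvclock_vcpu_time_info, __int128 support is assumed, and rounding
may differ slightly from the kernel helpers):

#include <stdint.h>

struct pvclock_fields {
        uint64_t tsc_timestamp;
        uint64_t system_time;
        uint32_t tsc_to_system_mul;
        int8_t   tsc_shift;
};

/* nsec = system_time + (tsc - tsc_timestamp) * mul * 2^(tsc_shift - 32) */
static uint64_t pvclock_ns(uint64_t tsc, const struct pvclock_fields *src)
{
        unsigned __int128 prod =
                (unsigned __int128)(tsc - src->tsc_timestamp) *
                src->tsc_to_system_mul;

        if (src->tsc_shift >= 0)
                prod <<= src->tsc_shift;
        else
                prod >>= -src->tsc_shift;

        return src->system_time + (uint64_t)(prod >> 32);
}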
index 3599404..5b2cc88 100644 (file)
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 
        do {
                version = pvclock_read_begin(src);
-               ret = __pvclock_read_cycles(src);
+               ret = __pvclock_read_cycles(src, rdtsc_ordered());
                flags = src->flags;
        } while (pvclock_read_retry(src, version));
 
index 464fa47..3bff207 100644 (file)
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF)    += $(KVM)/async_pf.o
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
                           i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-                          hyperv.o page_track.o
+                          hyperv.o page_track.o debugfs.o
 
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)    += assigned-dev.o iommu.o
 
index 3235e0f..afa7bbb 100644 (file)
@@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
                F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
                F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-               F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+               F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+               F(AVX512BW) | F(AVX512VL);
 
        /* cpuid 0xD.1.eax */
        const u32 kvm_cpuid_D_1_eax_x86_features =
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644 (file)
index 0000000..c19c7ed
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+       struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+       *val = vcpu->arch.tsc_offset;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+       struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+       *val = vcpu->arch.tsc_scaling_ratio;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+       *val = kvm_tsc_scaling_ratio_frac_bits;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       struct dentry *ret;
+
+       ret = debugfs_create_file("tsc-offset", 0444,
+                                                       vcpu->debugfs_dentry,
+                                                       vcpu, &vcpu_tsc_offset_fops);
+       if (!ret)
+               return -ENOMEM;
+
+       if (kvm_has_tsc_control) {
+               ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+                                                       vcpu->debugfs_dentry,
+                                                       vcpu, &vcpu_tsc_scaling_fops);
+               if (!ret)
+                       return -ENOMEM;
+               ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+                                                       vcpu->debugfs_dentry,
+                                                       vcpu, &vcpu_tsc_scaling_frac_fops);
+               if (!ret)
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
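
Once debugfs is mounted, the new per-vcpu files can be read like any other
text attribute. A small userspace sketch, assuming the usual <pid>-<vm_fd>/
vcpu<N> layout of the kvm debugfs tree (the path below is purely
illustrative):

#include <stdio.h>

int main(void)
{
        long long off;
        FILE *f = fopen("/sys/kernel/debug/kvm/1234-10/vcpu0/tsc-offset", "r");

        if (!f)
                return 1;
        if (fscanf(f, "%lld", &off) == 1)
                printf("tsc-offset: %lld\n", off);
        fclose(f);
        return 0;
}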
index 01bd7b7..42b1c83 100644 (file)
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-       return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_vcpu *vcpu;
+       u64 tsc;
+
+       /*
+        * If the guest has not set up the TSC page or the clock isn't
+        * stable, fall back to get_kvmclock_ns().
+        */
+       if (!hv->tsc_ref.tsc_sequence)
+               return div_u64(get_kvmclock_ns(kvm), 100);
+
+       vcpu = kvm_get_vcpu(kvm, 0);
+       tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+               + hv->tsc_ref.tsc_offset;
 }
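
The 100ns reference counter above reduces to a 128-bit multiply-shift plus an
offset. A minimal sketch of the arithmetic, assuming a compiler that provides
__int128:

#include <stdint.h>

/* (a * b) >> 64, as mul_u64_u64_shr() computes it */
static uint64_t mul_shr64(uint64_t a, uint64_t b)
{
        return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

/* nsec/100 = ticks * scale / 2^64 + offset */
static uint64_t ref_counter(uint64_t tsc, uint64_t scale, int64_t offset)
{
        return mul_shr64(tsc, scale) + (uint64_t)offset;
}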
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+                                       HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+       u64 max_mul;
+
+       if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+               return false;
+
+       /*
+        * Check if scale would overflow; if so, we use the time ref counter:
+        *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+        *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+        *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+        */
+       max_mul = 100ull << (32 - hv_clock->tsc_shift);
+       if (hv_clock->tsc_to_system_mul >= max_mul)
+               return false;
+
+       /*
+        * Otherwise compute the scale and offset according to the formulas
+        * derived above.
+        */
+       tsc_ref->tsc_scale =
+               mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+                               hv_clock->tsc_to_system_mul,
+                               100);
+
+       tsc_ref->tsc_offset = hv_clock->system_time;
+       do_div(tsc_ref->tsc_offset, 100);
+       tsc_ref->tsc_offset -=
+               mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+       return true;
+}
+
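
As a worked example of the conversion (the numbers are illustrative, not taken
from the patch): a 2.6 GHz guest TSC can be described by kvmclock as
tsc_to_system_mul ~= 3303820997 with tsc_shift = -1, i.e.
nsec = ticks * 3303820997 / 2^33. The derived scale is then roughly
2^64 / 260, so ticks * scale / 2^64 divides by 260 and one second of ticks
lands near 1e7 hundred-nanosecond units:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t tsc_to_system_mul = 3303820997u;       /* ~2.6 GHz */
        int tsc_shift = -1;
        uint64_t scale = (uint64_t)(((unsigned __int128)1 << (32 + tsc_shift)) *
                                    tsc_to_system_mul / 100);

        /* One second of ticks -> ~10000000 units of 100ns. */
        uint64_t ticks = 2600000000ull;
        printf("%llu\n", (unsigned long long)
               (((unsigned __int128)ticks * scale) >> 64));
        return 0;
}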
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+                          struct pvclock_vcpu_time_info *hv_clock)
+{
+       struct kvm_hv *hv = &kvm->arch.hyperv;
+       u32 tsc_seq;
+       u64 gfn;
+
+       BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+       BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+       if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+               return;
+
+       gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+       /*
+        * Because the TSC parameters only vary when there is a
+        * change in the master clock, do not bother with caching.
+        */
+       if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+                                   &tsc_seq, sizeof(tsc_seq))))
+               return;
+
+       /*
+        * While we're computing and writing the parameters, force the
+        * guest to use the time reference count MSR.
+        */
+       hv->tsc_ref.tsc_sequence = 0;
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                           &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+               return;
+
+       if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+               return;
+
+       /* Ensure sequence is zero before writing the rest of the struct.  */
+       smp_wmb();
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+               return;
+
+       /*
+        * Now switch to the TSC page mechanism by writing the sequence.
+        */
+       tsc_seq++;
+       if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+               tsc_seq = 1;
+
+       /* Write the struct entirely before the non-zero sequence.  */
+       smp_wmb();
+
+       hv->tsc_ref.tsc_sequence = tsc_seq;
+       kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                       &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
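
The zero/write/non-zero sequence dance above exists so that a guest can read
the page without taking locks. A guest-side sketch of the matching read
protocol (illustrative: rdtsc(), rmb() and the MSR fallback are stand-ins for
whatever the guest environment provides):

#include <stdint.h>

extern uint64_t rdtsc(void);              /* stand-in: read the TSC */
extern uint64_t read_ref_count_msr(void); /* stand-in: MSR fallback */
extern void rmb(void);                    /* stand-in: read barrier */

struct tsc_page {                         /* HV_REFERENCE_TSC_PAGE layout */
        volatile uint32_t tsc_sequence;
        uint32_t reserved;
        volatile uint64_t tsc_scale;
        volatile int64_t tsc_offset;
};

static uint64_t read_ref_counter(struct tsc_page *p)
{
        uint32_t seq;
        uint64_t scale, tsc;
        int64_t offset;

        do {
                seq = p->tsc_sequence;
                if (seq == 0)             /* page invalid, use the MSR */
                        return read_ref_count_msr();
                rmb();
                scale = p->tsc_scale;
                offset = p->tsc_offset;
                tsc = rdtsc();
                rmb();
        } while (p->tsc_sequence != seq); /* retry if updated mid-read */

        return (uint64_t)(((unsigned __int128)tsc * scale) >> 64) + offset;
}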
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                             bool host)
 {
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                mark_page_dirty(kvm, gfn);
                break;
        }
-       case HV_X64_MSR_REFERENCE_TSC: {
-               u64 gfn;
-               HV_REFERENCE_TSC_PAGE tsc_ref;
-
-               memset(&tsc_ref, 0, sizeof(tsc_ref));
+       case HV_X64_MSR_REFERENCE_TSC:
                hv->hv_tsc_page = data;
-               if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-                       break;
-               gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-               if (kvm_write_guest(
-                               kvm,
-                               gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-                               &tsc_ref, sizeof(tsc_ref)))
-                       return 1;
-               mark_page_dirty(kvm, gfn);
+               if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+                       kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
                break;
-       }
        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
                return kvm_hv_msr_set_crash_data(vcpu,
                                                 msr - HV_X64_MSR_CRASH_P0,
index 60eccd4..cd11195 100644 (file)
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+                          struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
index b62c852..23b99f3 100644 (file)
@@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                if (value & MSR_IA32_APICBASE_ENABLE) {
                        kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_key_slow_dec_deferred(&apic_hw_disabled);
-               } else
+               } else {
                        static_key_slow_inc(&apic_hw_disabled.key);
-               recalculate_apic_map(vcpu->kvm);
+                       recalculate_apic_map(vcpu->kvm);
+               }
        }
 
        if ((old_value ^ value) & X2APIC_ENABLE) {
index 3d4cc8c..d9c7e98 100644 (file)
@@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * Return true if tlb need be flushed.
  */
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
        u64 spte = *sptep;
 
@@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm,
        bool flush = false;
 
        for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_write_protect(kvm, sptep, pt_protect);
+               flush |= spte_write_protect(sptep, pt_protect);
 
        return flush;
 }
 
-static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_clear_dirty(u64 *sptep)
 {
        u64 spte = *sptep;
 
@@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        bool flush = false;
 
        for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_clear_dirty(kvm, sptep);
+               flush |= spte_clear_dirty(sptep);
 
        return flush;
 }
 
-static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_set_dirty(u64 *sptep)
 {
        u64 spte = *sptep;
 
@@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        bool flush = false;
 
        for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_set_dirty(kvm, sptep);
+               flush |= spte_set_dirty(sptep);
 
        return flush;
 }
index 1e6b84b..f8157a3 100644 (file)
@@ -34,6 +34,8 @@
 #include <linux/sched.h>
 #include <linux/trace_events.h>
 #include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -41,6 +43,7 @@
 #include <asm/desc.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK                0xFF0
 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK                0xFFFFFFFF
 
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS              8
+#define AVIC_VCPU_ID_MASK              ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS                        24
+#define AVIC_VM_ID_NR                  (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK                        ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y)               (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+                                               (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x)          ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x)                (x & AVIC_VCPU_ID_MASK)
+
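
The packing is straightforward: the low 8 bits carry the vCPU ID and the next
24 bits the VM ID. A quick standalone round-trip check of the encoding (the
macros are re-derived here, with argument parentheses added, so the sketch
compiles on its own):

#include <assert.h>
#include <stdint.h>

#define VCPU_ID_BITS       8
#define VCPU_ID_MASK       ((1u << VCPU_ID_BITS) - 1)
#define VM_ID_MASK         ((1u << 24) - 1)

#define GATAG(vm, vcpu)    ((((vm) & VM_ID_MASK) << VCPU_ID_BITS) | \
                            ((vcpu) & VCPU_ID_MASK))
#define GATAG_TO_VMID(x)   (((x) >> VCPU_ID_BITS) & VM_ID_MASK)
#define GATAG_TO_VCPUID(x) ((x) & VCPU_ID_MASK)

int main(void)
{
        uint32_t tag = GATAG(0x123456u, 0x78u);

        assert(tag == 0x12345678u);
        assert(GATAG_TO_VMID(tag) == 0x123456u);
        assert(GATAG_TO_VCPUID(tag) == 0x78u);
        return 0;
}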
 static bool erratum_383_found __read_mostly;
 
 static const u32 host_save_user_msrs[] = {
@@ -185,6 +201,23 @@ struct vcpu_svm {
        struct page *avic_backing_page;
        u64 *avic_physical_id_cache;
        bool avic_is_running;
+
+       /*
+        * Per-vcpu list of struct amd_svm_iommu_ir:
+        * This is used mainly to store interrupt remapping information used
+        * when updating the vcpu affinity. This avoids having to scan for
+        * the IRTE and match the ga_tag in the IOMMU driver.
+        */
+       struct list_head ir_list;
+       spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
+       void *data;             /* Storing pointer to struct amd_ir_data */
 };
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK   (0xFF)
@@ -242,6 +275,10 @@ static int avic;
 module_param(avic, int, S_IRUGO);
 #endif
 
+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
+/* Note:
+ * This hash table is used to map a VM_ID to a struct kvm_arch
+ * when handling an AMD IOMMU GALOG notification, in order to
+ * schedule in a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS  8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from the IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+       unsigned long flags;
+       struct kvm_arch *ka = NULL;
+       struct kvm_vcpu *vcpu = NULL;
+       u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+       u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+       pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+               struct kvm *kvm = container_of(ka, struct kvm, arch);
+               struct kvm_arch *vm_data = &kvm->arch;
+
+               if (vm_data->avic_vm_id != vm_id)
+                       continue;
+               vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               break;
+       }
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+       if (!vcpu)
+               return 0;
+
+       /* Note:
+        * At this point, the IOMMU should have already set the pending
+        * bit in the vAPIC backing page. So, we just need to schedule
+        * in the vcpu.
+        */
+       if (vcpu->mode == OUTSIDE_GUEST_MODE)
+               kvm_vcpu_wake_up(vcpu);
+
+       return 0;
+}
+
 static __init int svm_hardware_setup(void)
 {
        int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
        if (avic) {
                if (!npt_enabled ||
                    !boot_cpu_has(X86_FEATURE_AVIC) ||
-                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
                        avic = false;
-               else
+               } else {
                        pr_info("AVIC enabled\n");
+
+                       hash_init(svm_vm_data_hash);
+                       spin_lock_init(&svm_vm_data_hash_lock);
+                       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+               }
        }
 
        return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
 
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return svm->vmcb->control.tsc_offset;
-}
-
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static inline int avic_get_next_vm_id(void)
+{
+       int id;
+
+       spin_lock(&avic_vm_id_lock);
+
+       /* AVIC VM ID is one-based. */
+       id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+       if (id <= AVIC_VM_ID_MASK)
+               __set_bit(id, avic_vm_id_bitmap);
+       else
+               id = -EAGAIN;
+
+       spin_unlock(&avic_vm_id_lock);
+       return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+       if (id <= 0 || id > AVIC_VM_ID_MASK)
+               return -EINVAL;
+
+       spin_lock(&avic_vm_id_lock);
+       __clear_bit(id, avic_vm_id_bitmap);
+       spin_unlock(&avic_vm_id_lock);
+       return 0;
+}
+
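
The pair above is a small one-based ID pool over a bitmap. A simplified
userspace sketch of the same pattern (illustrative only; the kernel version
serializes with avic_vm_id_lock, omitted here, and uses the bitmap helpers):

#include <assert.h>
#include <stdbool.h>

#define NR_IDS 16
static bool used[NR_IDS];

static int get_next_id(void)
{
        for (int id = 1; id < NR_IDS; id++)  /* IDs are one-based */
                if (!used[id]) {
                        used[id] = true;
                        return id;
                }
        return -1;                           /* -EAGAIN in the kernel */
}

static void free_id(int id)
{
        if (id > 0 && id < NR_IDS)
                used[id] = false;
}

int main(void)
{
        int a = get_next_id(), b = get_next_id();

        assert(a == 1 && b == 2);
        free_id(a);
        assert(get_next_id() == 1);          /* lowest free ID is reused */
        return 0;
}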
 static void avic_vm_destroy(struct kvm *kvm)
 {
+       unsigned long flags;
        struct kvm_arch *vm_data = &kvm->arch;
 
+       avic_free_vm_id(vm_data->avic_vm_id);
+
        if (vm_data->avic_logical_id_table_page)
                __free_page(vm_data->avic_logical_id_table_page);
        if (vm_data->avic_physical_id_table_page)
                __free_page(vm_data->avic_physical_id_table_page);
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_del(&vm_data->hnode);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
 static int avic_vm_init(struct kvm *kvm)
 {
-       int err = -ENOMEM;
+       unsigned long flags;
+       int vm_id, err = -ENOMEM;
        struct kvm_arch *vm_data = &kvm->arch;
        struct page *p_page;
        struct page *l_page;
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
        if (!avic)
                return 0;
 
+       vm_id = avic_get_next_vm_id();
+       if (vm_id < 0)
+               return vm_id;
+       vm_data->avic_vm_id = (u32)vm_id;
+
        /* Allocating physical APIC ID table (4KB) */
        p_page = alloc_page(GFP_KERNEL);
        if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
        vm_data->avic_logical_id_table_page = l_page;
        clear_page(page_address(l_page));
 
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
        return 0;
 
 free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
        return err;
 }
 
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 {
-       u64 entry;
-       int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       svm->avic_is_running = is_run;
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
 
-       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
-       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-               return;
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entries targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
 
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+       if (list_empty(&svm->ir_list))
+               goto out;
 
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (is_run)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               ret = amd_iommu_update_ga(cpu, r, ir->data);
+               if (ret)
+                       break;
+       }
+out:
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
 }
 
 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+                                       svm->avic_is_running);
 }
 
 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
                return;
 
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
+/*
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->avic_is_running = is_run;
+       if (is_run)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+}
+
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
                err = avic_init_backing_page(&svm->vcpu);
                if (err)
                        goto free_page4;
+
+               INIT_LIST_HEAD(&svm->ir_list);
+               spin_lock_init(&svm->ir_list_lock);
        }
 
        /* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
                kvm_vcpu_wake_up(vcpu);
 }
 
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+       unsigned long flags;
+       struct amd_svm_iommu_ir *cur;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur->data != pi->ir_data)
+                       continue;
+               list_del(&cur->node);
+               kfree(cur);
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+
+       /*
+        * In some cases, the existing irte is updated and re-set,
+        * so we need to check here if it's already been added
+        * to the ir_list.
+        */
+       if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+               struct kvm *kvm = svm->vcpu.kvm;
+               u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+               struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               struct vcpu_svm *prev_svm;
+
+               if (!prev_vcpu) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               prev_svm = to_svm(prev_vcpu);
+               svm_ir_list_del(prev_svm, pi);
+       }
+
+       /*
+        * Allocate a new amd_svm_iommu_ir, which will be added
+        * to the per-vcpu ir_list.
+        */
+       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+       if (!ir) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ir->data = pi->ir_data;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_add(&ir->node, &svm->ir_list);
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+       return ret;
+}
+
+/*
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU, so we still use legacy interrupt
+ * remapping for these kinds of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with a single CPU as the destination, e.g. the user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+                struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu = NULL;
+
+       kvm_set_msi_irq(kvm, e, &irq);
+
+       if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+               pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+                        __func__, irq.vector);
+               return -1;
+       }
+
+       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+                irq.vector);
+       *svm = to_svm(vcpu);
+       vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+       vcpu_info->vector = irq.vector;
+
+       return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                             uint32_t guest_irq, bool set)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       int idx, ret = -EINVAL;
+
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+
+       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+                __func__, host_irq, guest_irq, set);
+
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               struct vcpu_data vcpu_info;
+               struct vcpu_svm *svm = NULL;
+
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+
+               /*
+                * Here, we set up legacy mode in the following cases:
+                * 1. The interrupt cannot be targeted to a specific vcpu.
+                * 2. The posted interrupt is being unset.
+                * 3. APIC virtualization is disabled for the vcpu.
+                */
+               if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+                   kvm_vcpu_apicv_active(&svm->vcpu)) {
+                       struct amd_iommu_pi_data pi;
+
+                       /* Try to enable guest_mode in IRTE */
+                       pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+                       pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+                                                    svm->vcpu.vcpu_id);
+                       pi.is_guest_mode = true;
+                       pi.vcpu_data = &vcpu_info;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /*
+                        * Here, we have successfully set up vcpu affinity in
+                        * IOMMU guest mode. Now, we need to store the posted
+                        * interrupt information in a per-vcpu ir_list so that
+                        * we can reference it directly when we update vcpu
+                        * scheduling information in the IOMMU irte.
+                        */
+                       if (!ret && pi.is_guest_mode)
+                               svm_ir_list_add(svm, &pi);
+               } else {
+                       /* Use legacy mode in IRTE */
+                       struct amd_iommu_pi_data pi;
+
+                       /*
+                        * Here, pi is used to:
+                        * - Tell IOMMU to use legacy mode for this interrupt.
+                        * - Retrieve ga_tag of prior interrupt remapping data.
+                        */
+                       pi.is_guest_mode = false;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /*
+                        * Check if the posted interrupt was previously
+                        * set up with guest_mode by checking if the ga_tag
+                        * was cached. If so, we need to clean up the per-vcpu
+                        * ir_list.
+                        */
+                       if (!ret && pi.prev_ga_tag) {
+                               int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+                               struct kvm_vcpu *vcpu;
+
+                               vcpu = kvm_get_vcpu_by_id(kvm, id);
+                               if (vcpu)
+                                       svm_ir_list_del(to_svm(vcpu), &pi);
+                       }
+               }
+
+               if (!ret && svm) {
+                       trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+                                                host_irq, e->gsi,
+                                                vcpu_info.vector,
+                                                vcpu_info.pi_desc_addr, set);
+               }
+
+               if (ret < 0) {
+                       pr_err("%s: failed to update PI IRTE\n", __func__);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
-       .read_tsc_offset = svm_read_tsc_offset,
        .write_tsc_offset = svm_write_tsc_offset,
        .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
        .read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .pmu_ops = &amd_pmu_ops,
        .deliver_posted_interrupt = svm_deliver_avic_intr,
+       .update_pi_irte = svm_update_pi_irte,
 };
 
 static int __init svm_init(void)
index 121fdf6..cf1b16d 100644 (file)
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock);
 static struct vmcs_config {
        int size;
        int order;
+       u32 basic_cap;
        u32 revision_id;
        u32 pin_based_exec_ctrl;
        u32 cpu_based_exec_ctrl;
@@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void)
                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+       return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 {
        return flexpriority_enabled && lapic_in_kernel(vcpu);
@@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
        else if (cpu_has_secondary_exec_ctrls() &&
                 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-               if (is_long_mode(vcpu))
-                       msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-               else
-                       msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+               if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+                       if (is_long_mode(vcpu))
+                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+                       else
+                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+               } else {
+                       if (is_long_mode(vcpu))
+                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+                       else
+                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+               }
        } else {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
        return host_tsc + tsc_offset;
 }
 
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-       return vmcs_read64(TSC_OFFSET);
-}
-
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
                           ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
                           (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+               if (cpu_has_vmx_basic_inout())
+                       *pdata |= VMX_BASIC_INOUT;
                break;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
@@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                return -EIO;
 
        vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_config.size);
+       vmcs_conf->order = get_order(vmcs_conf->size);
+       vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
        vmcs_conf->revision_id = vmx_msr_low;
 
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
                                                msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-       __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_R);
-       __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_R);
+       if (apicv_active) {
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_R);
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_R);
+       } else {
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+       }
 }
 
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_R);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_R);
+       if (apicv_active) {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_R);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_R);
+       } else {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+       }
 }
 
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
 {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_W);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_W);
+       if (apicv_active) {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_W);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_W);
+       } else {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_W);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_W);
+       }
 }
 
 static bool vmx_get_enable_apicv(void)
@@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (is_guest_mode(vcpu))
-               return;
+       if (!is_guest_mode(vcpu)) {
+               if (!cpu_has_virtual_nmis()) {
+                       /*
+                        * Tracking the NMI-blocked state in software is built upon
+                        * finding the next open IRQ window. This, in turn, depends on
+                        * well-behaving guests: They have to keep IRQs disabled at
+                        * least as long as the NMI handler runs. Otherwise we may
+                        * cause NMI nesting, maybe breaking the guest. But as this is
+                        * highly unlikely, we can live with the residual risk.
+                        */
+                       vmx->soft_vnmi_blocked = 1;
+                       vmx->vnmi_blocked_time = 0;
+               }
 
-       if (!cpu_has_virtual_nmis()) {
-               /*
-                * Tracking the NMI-blocked state in software is built upon
-                * finding the next open IRQ window. This, in turn, depends on
-                * well-behaving guests: They have to keep IRQs disabled at
-                * least as long as the NMI handler runs. Otherwise we may
-                * cause NMI nesting, maybe breaking the guest. But as this is
-                * highly unlikely, we can live with the residual risk.
-                */
-               vmx->soft_vnmi_blocked = 1;
-               vmx->vnmi_blocked_time = 0;
+               ++vcpu->stat.nmi_injections;
+               vmx->nmi_known_unmasked = false;
        }
 
-       ++vcpu->stat.nmi_injections;
-       vmx->nmi_known_unmasked = false;
        if (vmx->rmode.vm86_active) {
                if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                return;
        }
+
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
@@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
        gla_validity = (exit_qualification >> 7) & 0x3;
-       if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+       if (gla_validity == 0x2) {
                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,22 +6395,32 @@ static __init int hardware_setup(void)
        if (!vmx_msr_bitmap_legacy_x2apic)
                goto out2;
 
+       vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+               goto out3;
+
        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode)
-               goto out3;
+               goto out4;
 
        vmx_msr_bitmap_longmode_x2apic =
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
-               goto out4;
+               goto out5;
+
+       vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+               goto out6;
 
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
-               goto out6;
+               goto out7;
 
        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmwrite_bitmap)
-               goto out7;
+               goto out8;
 
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6394,7 +6439,7 @@ static __init int hardware_setup(void)
 
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
-               goto out8;
+               goto out9;
        }
 
        if (boot_cpu_has(X86_FEATURE_NX))
@@ -6461,20 +6506,35 @@ static __init int hardware_setup(void)
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
        memcpy(vmx_msr_bitmap_longmode_x2apic,
                        vmx_msr_bitmap_longmode, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                       vmx_msr_bitmap_legacy, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                       vmx_msr_bitmap_longmode, PAGE_SIZE);
 
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
+       /*
+        * MSR intercepts used when
+        * enable_apicv && kvm_vcpu_apicv_active()
+        */
        for (msr = 0x800; msr <= 0x8ff; msr++)
-               vmx_disable_intercept_msr_read_x2apic(msr);
+               vmx_disable_intercept_msr_read_x2apic(msr, true);
 
        /* TMCCT */
-       vmx_enable_intercept_msr_read_x2apic(0x839);
+       vmx_enable_intercept_msr_read_x2apic(0x839, true);
        /* TPR */
-       vmx_disable_intercept_msr_write_x2apic(0x808);
+       vmx_disable_intercept_msr_write_x2apic(0x808, true);
        /* EOI */
-       vmx_disable_intercept_msr_write_x2apic(0x80b);
+       vmx_disable_intercept_msr_write_x2apic(0x80b, true);
        /* SELF-IPI */
-       vmx_disable_intercept_msr_write_x2apic(0x83f);
+       vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+       /*
+        * MSR intercepts used when
+        * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+        *      !enable_apicv
+        */
+       /* TPR */
+       vmx_disable_intercept_msr_read_x2apic(0x808, false);
+       vmx_disable_intercept_msr_write_x2apic(0x808, false);
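
The four x2APIC MSR bitmaps now come in APICv-active and APICv-inactive
flavours. A minimal sketch of how a bitmap would be selected, assuming
the helper name msr_bitmap_for() is made up here (the real selection
logic lives elsewhere in vmx.c and is not part of this hunk), while
is_long_mode() and kvm_vcpu_apicv_active() are existing KVM helpers:

        static unsigned long *msr_bitmap_for(struct kvm_vcpu *vcpu, bool x2apic)
        {
                bool longmode = is_long_mode(vcpu);

                /* Non-x2APIC guests keep using the plain bitmaps. */
                if (!x2apic)
                        return longmode ? vmx_msr_bitmap_longmode
                                        : vmx_msr_bitmap_legacy;
                /* x2APIC with APICv active: TPR/EOI/SELF-IPI writes pass through. */
                if (kvm_vcpu_apicv_active(vcpu))
                        return longmode ? vmx_msr_bitmap_longmode_x2apic
                                        : vmx_msr_bitmap_legacy_x2apic;
                /* x2APIC with APICv inactive: only TPR is pass-through. */
                return longmode ? vmx_msr_bitmap_longmode_x2apic_apicv_inactive
                                : vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
        }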
 
        if (enable_ept) {
                kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6521,14 +6581,18 @@ static __init int hardware_setup(void)
 
        return alloc_kvm_area();
 
-out8:
+out9:
        free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
        free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
 out6:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
 out3:
        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
 out2:
@@ -6544,7 +6608,9 @@ out:
 static __exit void hardware_unsetup(void)
 {
        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
        free_page((unsigned long)vmx_msr_bitmap_legacy);
        free_page((unsigned long)vmx_msr_bitmap_longmode);
        free_page((unsigned long)vmx_io_bitmap_b);
@@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 {
        /* TODO: not to reset guest simply here. */
        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-       pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+       pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 }
 
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        vmx->nested.vmcs02_num = 0;
 
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL);
+                    HRTIMER_MODE_REL_PINNED);
        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
        vmx->nested.vmxon = true;
@@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
                return;
        }
 
-       /*
-        * There is not point to enable virtualize x2apic without enable
-        * apicv
-        */
-       if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-                               !kvm_vcpu_apicv_active(vcpu))
+       if (!cpu_has_vmx_virtualize_x2apic_mode())
                return;
 
        if (!cpu_need_tpr_shadow(vcpu))
@@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
        maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-               pr_warn_ratelimited(
+               pr_debug_ratelimited(
                        "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
                        addr_field, maxphyaddr, count, addr);
                return -EINVAL;
@@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
        for (i = 0; i < count; i++) {
                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
                                        &e, sizeof(e))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        goto fail;
                }
                if (nested_vmx_load_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        goto fail;
@@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                msr.index = e.index;
                msr.data = e.value;
                if (kvm_set_msr(vcpu, &msr)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
                        goto fail;
@@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                if (kvm_vcpu_read_guest(vcpu,
                                        gpa + i * sizeof(e),
                                        &e, 2 * sizeof(u32))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        return -EINVAL;
                }
                if (nested_vmx_store_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        return -EINVAL;
@@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                msr_info.host_initiated = false;
                msr_info.index = e.index;
                if (kvm_get_msr(vcpu, &msr_info)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR (%u, 0x%x)\n",
                                __func__, i, e.index);
                        return -EINVAL;
@@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                                         gpa + i * sizeof(e) +
                                             offsetof(struct vmx_msr_entry, value),
                                         &msr_info.data, sizeof(msr_info.data))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, msr_info.data);
                        return -EINVAL;
@@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
        }
 
+       if (nested_cpu_has_ept(vmcs12))
+               vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
        if (nested_cpu_has_vid(vmcs12))
                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
 
@@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         * We are now running in L2, mmu_notifier will force to reload the
         * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
         */
-       kvm_vcpu_reload_apic_access_page(vcpu);
+       kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
        /*
         * Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-       .read_tsc_offset = vmx_read_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
        .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
        .read_l1_tsc = vmx_read_l1_tsc,
index 699f872..6c633de 100644 (file)
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-       u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+       u64 curr_offset = vcpu->arch.tsc_offset;
        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       vcpu->arch.tsc_offset = offset;
+}
+
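
With vcpu->arch.tsc_offset now cached (and read directly in
update_ia32_tsc_adjust_msr() above), every offset update has to funnel
through this wrapper so the cached value and the value programmed via
kvm_x86_ops->write_tsc_offset() cannot diverge; the remaining direct
calls are converted to kvm_vcpu_write_tsc_offset() later in this patch.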
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
        struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_compute_tsc_offset(vcpu, data);
-       ns = get_kernel_ns();
+       ns = ktime_get_boot_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
        if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
        if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
                update_ia32_tsc_adjust_msr(vcpu, offset);
-       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+       struct kvm_arch *ka = &kvm->arch;
+       s64 ns;
+
+       if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+               u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+               ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+       } else {
+               ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+       }
+
+       return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+       unsigned long flags;
+       s64 ns;
+
+       local_irq_save(flags);
+       ns = __get_kvmclock_ns(kvm);
+       local_irq_restore(flags);
+
+       return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+       struct kvm_vcpu_arch *vcpu = &v->arch;
+       struct pvclock_vcpu_time_info guest_hv_clock;
+
+       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+               &guest_hv_clock, sizeof(guest_hv_clock))))
+               return;
+
+       /* This VCPU is paused, but it's legal for a guest to read another
+        * VCPU's kvmclock, so we really have to follow the specification where
+        * it says that version is odd if data is being modified, and even after
+        * it is consistent.
+        *
+        * Version field updates must be kept separate.  This is because
+        * kvm_write_guest_cached might use a "rep movs" instruction, and
+        * writes within a string instruction are weakly ordered.  So there
+        * are three writes overall.
+        *
+        * As a small optimization, only write the version field in the first
+        * and third write.  The vcpu->pv_time cache is still valid, because the
+        * version field is the first in the struct.
+        */
+       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+       vcpu->hv_clock.version = guest_hv_clock.version + 1;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+
+       smp_wmb();
+
+       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+       vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+
+       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock));
+
+       smp_wmb();
+
+       vcpu->hv_clock.version++;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+}
+
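
For reference, the guest-side counterpart of the version protocol the
comment above describes: a reader retries while the version is odd (a
write is in flight) or changed across the data read. A simplified
sketch, assuming src points at the guest's mapping of the structure
(the helper name is illustrative):

        static u64 pvclock_read_system_time(struct pvclock_vcpu_time_info *src)
        {
                u32 version;
                u64 system_time;

                do {
                        version = src->version;
                        smp_rmb();      /* read data only after the version */
                        system_time = src->system_time;
                        smp_rmb();      /* re-check version after the data */
                } while ((version & 1) || version != src->version);

                return system_time;
        }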
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
        unsigned long flags, tgt_tsc_khz;
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
        u64 tsc_timestamp, host_tsc;
-       struct pvclock_vcpu_time_info guest_hv_clock;
        u8 pvclock_flags;
        bool use_master_clock;
 
@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        }
        if (!use_master_clock) {
                host_tsc = rdtsc();
-               kernel_ns = get_kernel_ns();
+               kernel_ns = ktime_get_boot_ns();
        }
 
        tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        local_irq_restore(flags);
 
-       if (!vcpu->pv_time_enabled)
-               return 0;
+       /* With all the info we got, fill in the values */
 
        if (kvm_has_tsc_control)
                tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
 
-       /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
        vcpu->last_guest_tsc = tsc_timestamp;
 
-       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-               &guest_hv_clock, sizeof(guest_hv_clock))))
-               return 0;
-
-       /* This VCPU is paused, but it's legal for a guest to read another
-        * VCPU's kvmclock, so we really have to follow the specification where
-        * it says that version is odd if data is being modified, and even after
-        * it is consistent.
-        *
-        * Version field updates must be kept separate.  This is because
-        * kvm_write_guest_cached might use a "rep movs" instruction, and
-        * writes within a string instruction are weakly ordered.  So there
-        * are three writes overall.
-        *
-        * As a small optimization, only write the version field in the first
-        * and third write.  The vcpu->pv_time cache is still valid, because the
-        * version field is the first in the struct.
-        */
-       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-       vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
-
-       smp_wmb();
-
-       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-       pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-       if (vcpu->pvclock_set_guest_stopped_request) {
-               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-               vcpu->pvclock_set_guest_stopped_request = false;
-       }
-
        /* If the host uses TSC clocksource, then it is stable */
+       pvclock_flags = 0;
        if (use_master_clock)
                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 
        vcpu->hv_clock.flags = pvclock_flags;
 
-       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock));
-
-       smp_wmb();
-
-       vcpu->hv_clock.version++;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       if (vcpu->pv_time_enabled)
+               kvm_setup_pvclock_page(v);
+       if (v == kvm_get_vcpu(v->kvm, 0))
+               kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
 }
 
@@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
-                       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+                       kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
                }
                if (kvm_lapic_hv_timer_in_use(vcpu) &&
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        case KVM_SET_CLOCK: {
                struct kvm_clock_data user_ns;
                u64 now_ns;
-               s64 delta;
 
                r = -EFAULT;
                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
                r = 0;
                local_irq_disable();
-               now_ns = get_kernel_ns();
-               delta = user_ns.clock - now_ns;
+               now_ns = __get_kvmclock_ns(kvm);
+               kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
                local_irq_enable();
-               kvm->arch.kvmclock_offset = delta;
                kvm_gen_update_masterclock(kvm);
                break;
        }
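
Worked example of the new KVM_SET_CLOCK arithmetic: if
__get_kvmclock_ns() currently reads 100s and userspace requests 40s,
kvmclock_offset is adjusted by (40 - 100) = -60s, so the next kvmclock
read returns roughly 40s regardless of whether the stable-TSC path or
the ktime_get_boot_ns() fallback produced the current reading.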
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                struct kvm_clock_data user_ns;
                u64 now_ns;
 
-               local_irq_disable();
-               now_ns = get_kernel_ns();
-               user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-               local_irq_enable();
+               now_ns = get_kvmclock_ns(kvm);
+               user_ns.clock = now_ns;
                user_ns.flags = 0;
                memset(&user_ns.pad, 0, sizeof(user_ns.pad));
 
@@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        kvm_put_guest_xcr0(vcpu);
 
-       /* Interrupt is enabled by handle_external_intr() */
        kvm_x86_ops->handle_external_intr(vcpu);
 
        ++vcpu->stat.exits;
@@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
         * before any KVM threads can be running.  Unfortunately, we can't
         * bring the TSCs fully up to date with real time, as we aren't yet far
         * enough into CPU bringup that we know how much real time has actually
-        * elapsed; our helper function, get_kernel_ns() will be using boot
+        * elapsed; our helper function, ktime_get_boot_ns() will be using boot
         * variables that haven't been updated yet.
         *
         * So we simply find the maximum observed TSC above, then record the
@@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        mutex_init(&kvm->arch.apic_map_lock);
        spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
+       kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
        pvclock_update_vm_gtod_copy(kvm);
 
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
index a82ca46..e8ff3e4 100644 (file)
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
        return kvm_register_write(vcpu, reg, val);
 }
 
-static inline u64 get_kernel_ns(void)
-{
-       return ktime_get_boot_ns();
-}
-
 static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 {
        return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
        gva_t addr, void *val, unsigned int bytes,
index 4025291..58fa8cc 100644 (file)
@@ -137,6 +137,7 @@ struct iommu_dev_data {
        bool pri_tlp;                     /* PASID TLB required for
                                             PPR completions */
        u32 errata;                       /* Bitmap for errata to apply */
+       bool use_vapic;                   /* Enable device to use vapic mode */
 };
 
 /*
@@ -707,14 +708,74 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu)
        }
 }
 
+#ifdef CONFIG_IRQ_REMAP
+static int (*iommu_ga_log_notifier)(u32);
+
+int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+       iommu_ga_log_notifier = notifier;
+
+       return 0;
+}
+EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
+
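
A sketch of how a consumer (in this series, KVM's AVIC support) would
hook the GA log; the handler name is made up, the registration call is
the function exported above:

        static int my_ga_log_handler(u32 ga_tag)
        {
                /* Look up whatever the GA tag encodes (e.g. a vCPU) and wake it. */
                pr_debug("GA log event, tag %#x\n", ga_tag);
                return 0;
        }

        /* at init time: */
        amd_iommu_register_ga_log_notifier(my_ga_log_handler);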
+static void iommu_poll_ga_log(struct amd_iommu *iommu)
+{
+       u32 head, tail, cnt = 0;
+
+       if (iommu->ga_log == NULL)
+               return;
+
+       head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+       tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+       while (head != tail) {
+               volatile u64 *raw;
+               u64 log_entry;
+
+               raw = (u64 *)(iommu->ga_log + head);
+               cnt++;
+
+               /* Avoid memcpy function-call overhead */
+               log_entry = *raw;
+
+               /* Update head pointer of hardware ring-buffer */
+               head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
+               writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+
+               /* Handle GA entry */
+               switch (GA_REQ_TYPE(log_entry)) {
+               case GA_GUEST_NR:
+                       if (!iommu_ga_log_notifier)
+                               break;
+
+                       pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
+                                __func__, GA_DEVID(log_entry),
+                                GA_TAG(log_entry));
+
+                       if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
+                               pr_err("AMD-Vi: GA log notifier failed.\n");
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+#define AMD_IOMMU_INT_MASK     \
+       (MMIO_STATUS_EVT_INT_MASK | \
+        MMIO_STATUS_PPR_INT_MASK | \
+        MMIO_STATUS_GALOG_INT_MASK)
+
 irqreturn_t amd_iommu_int_thread(int irq, void *data)
 {
        struct amd_iommu *iommu = (struct amd_iommu *) data;
        u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 
-       while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
-               /* Enable EVT and PPR interrupts again */
-               writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
+       while (status & AMD_IOMMU_INT_MASK) {
+               /* Enable EVT, PPR, and GA interrupts again */
+               writel(AMD_IOMMU_INT_MASK,
                        iommu->mmio_base + MMIO_STATUS_OFFSET);
 
                if (status & MMIO_STATUS_EVT_INT_MASK) {
@@ -727,6 +788,13 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data)
                        iommu_poll_ppr_log(iommu);
                }
 
+#ifdef CONFIG_IRQ_REMAP
+               if (status & MMIO_STATUS_GALOG_INT_MASK) {
+                       pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
+                       iommu_poll_ga_log(iommu);
+               }
+#endif
+
                /*
                 * Hardware bug: ERBT1312
                 * When re-enabling interrupt (by writing 1
@@ -2967,6 +3035,12 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
        if (!iommu)
                return;
 
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+           (dom->type == IOMMU_DOMAIN_UNMANAGED))
+               dev_data->use_vapic = 0;
+#endif
+
        iommu_completion_wait(iommu);
 }
 
@@ -2992,6 +3066,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 
        ret = attach_device(dev, domain);
 
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+               if (dom->type == IOMMU_DOMAIN_UNMANAGED)
+                       dev_data->use_vapic = 1;
+               else
+                       dev_data->use_vapic = 0;
+       }
+#endif
+
        iommu_completion_wait(iommu);
 
        return ret;
@@ -3530,34 +3613,6 @@ EXPORT_SYMBOL(amd_iommu_device_info);
  *
  *****************************************************************************/
 
-union irte {
-       u32 val;
-       struct {
-               u32 valid       : 1,
-                   no_fault    : 1,
-                   int_type    : 3,
-                   rq_eoi      : 1,
-                   dm          : 1,
-                   rsvd_1      : 1,
-                   destination : 8,
-                   vector      : 8,
-                   rsvd_2      : 8;
-       } fields;
-};
-
-struct irq_2_irte {
-       u16 devid; /* Device ID for IRTE table */
-       u16 index; /* Index into IRTE table*/
-};
-
-struct amd_ir_data {
-       struct irq_2_irte                       irq_2_irte;
-       union irte                              irte_entry;
-       union {
-               struct msi_msg                  msi_entry;
-       };
-};
-
 static struct irq_chip amd_ir_chip;
 
 #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
@@ -3579,8 +3634,6 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
        amd_iommu_dev_table[devid].data[2] = dte;
 }
 
-#define IRTE_ALLOCATED (~1U)
-
 static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
 {
        struct irq_remap_table *table = NULL;
@@ -3626,13 +3679,18 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
                goto out;
        }
 
-       memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));
+       if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+               memset(table->table, 0,
+                      MAX_IRQS_PER_TABLE * sizeof(u32));
+       else
+               memset(table->table, 0,
+                      (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
 
        if (ioapic) {
                int i;
 
                for (i = 0; i < 32; ++i)
-                       table->table[i] = IRTE_ALLOCATED;
+                       iommu->irte_ops->set_allocated(table, i);
        }
 
        irq_lookup_table[devid] = table;
@@ -3658,6 +3716,10 @@ static int alloc_irq_index(u16 devid, int count)
        struct irq_remap_table *table;
        unsigned long flags;
        int index, c;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+       if (!iommu)
+               return -ENODEV;
 
        table = get_irq_table(devid, false);
        if (!table)
@@ -3669,14 +3731,14 @@ static int alloc_irq_index(u16 devid, int count)
        for (c = 0, index = table->min_index;
             index < MAX_IRQS_PER_TABLE;
             ++index) {
-               if (table->table[index] == 0)
+               if (!iommu->irte_ops->is_allocated(table, index))
                        c += 1;
                else
                        c = 0;
 
                if (c == count) {
                        for (; c != 0; --c)
-                               table->table[index - c + 1] = IRTE_ALLOCATED;
+                               iommu->irte_ops->set_allocated(table, index - c + 1);
 
                        index -= count - 1;
                        goto out;
@@ -3691,7 +3753,42 @@ out:
        return index;
 }
 
-static int modify_irte(u16 devid, int index, union irte irte)
+static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
+                         struct amd_ir_data *data)
+{
+       struct irq_remap_table *table;
+       struct amd_iommu *iommu;
+       unsigned long flags;
+       struct irte_ga *entry;
+
+       iommu = amd_iommu_rlookup_table[devid];
+       if (iommu == NULL)
+               return -EINVAL;
+
+       table = get_irq_table(devid, false);
+       if (!table)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&table->lock, flags);
+
+       entry = (struct irte_ga *)table->table;
+       entry = &entry[index];
+       entry->lo.fields_remap.valid = 0;
+       entry->hi.val = irte->hi.val;
+       entry->lo.val = irte->lo.val;
+       entry->lo.fields_remap.valid = 1;
+       if (data)
+               data->ref = entry;
+
+       spin_unlock_irqrestore(&table->lock, flags);
+
+       iommu_flush_irt(iommu, devid);
+       iommu_completion_wait(iommu);
+
+       return 0;
+}
+
+static int modify_irte(u16 devid, int index, union irte *irte)
 {
        struct irq_remap_table *table;
        struct amd_iommu *iommu;
@@ -3706,7 +3803,7 @@ static int modify_irte(u16 devid, int index, union irte irte)
                return -ENOMEM;
 
        spin_lock_irqsave(&table->lock, flags);
-       table->table[index] = irte.val;
+       table->table[index] = irte->val;
        spin_unlock_irqrestore(&table->lock, flags);
 
        iommu_flush_irt(iommu, devid);
@@ -3730,13 +3827,146 @@ static void free_irte(u16 devid, int index)
                return;
 
        spin_lock_irqsave(&table->lock, flags);
-       table->table[index] = 0;
+       iommu->irte_ops->clear_allocated(table, index);
        spin_unlock_irqrestore(&table->lock, flags);
 
        iommu_flush_irt(iommu, devid);
        iommu_completion_wait(iommu);
 }
 
+static void irte_prepare(void *entry,
+                        u32 delivery_mode, u32 dest_mode,
+                        u8 vector, u32 dest_apicid, int devid)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->val                = 0;
+       irte->fields.vector      = vector;
+       irte->fields.int_type    = delivery_mode;
+       irte->fields.destination = dest_apicid;
+       irte->fields.dm          = dest_mode;
+       irte->fields.valid       = 1;
+}
+
+static void irte_ga_prepare(void *entry,
+                           u32 delivery_mode, u32 dest_mode,
+                           u8 vector, u32 dest_apicid, int devid)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       struct iommu_dev_data *dev_data = search_dev_data(devid);
+
+       irte->lo.val                      = 0;
+       irte->hi.val                      = 0;
+       irte->lo.fields_remap.guest_mode  = dev_data ? dev_data->use_vapic : 0;
+       irte->lo.fields_remap.int_type    = delivery_mode;
+       irte->lo.fields_remap.dm          = dest_mode;
+       irte->hi.fields.vector            = vector;
+       irte->lo.fields_remap.destination = dest_apicid;
+       irte->lo.fields_remap.valid       = 1;
+}
+
+static void irte_activate(void *entry, u16 devid, u16 index)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->fields.valid = 1;
+       modify_irte(devid, index, irte);
+}
+
+static void irte_ga_activate(void *entry, u16 devid, u16 index)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+
+       irte->lo.fields_remap.valid = 1;
+       modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_deactivate(void *entry, u16 devid, u16 index)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->fields.valid = 0;
+       modify_irte(devid, index, irte);
+}
+
+static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+
+       irte->lo.fields_remap.valid = 0;
+       modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_set_affinity(void *entry, u16 devid, u16 index,
+                             u8 vector, u32 dest_apicid)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->fields.vector = vector;
+       irte->fields.destination = dest_apicid;
+       modify_irte(devid, index, irte);
+}
+
+static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
+                                u8 vector, u32 dest_apicid)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       struct iommu_dev_data *dev_data = search_dev_data(devid);
+
+       if (!dev_data || !dev_data->use_vapic) {
+               irte->hi.fields.vector = vector;
+               irte->lo.fields_remap.destination = dest_apicid;
+               irte->lo.fields_remap.guest_mode = 0;
+               modify_irte_ga(devid, index, irte, NULL);
+       }
+}
+
+#define IRTE_ALLOCATED (~1U)
+static void irte_set_allocated(struct irq_remap_table *table, int index)
+{
+       table->table[index] = IRTE_ALLOCATED;
+}
+
+static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
+{
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+
+       memset(&irte->lo.val, 0, sizeof(u64));
+       memset(&irte->hi.val, 0, sizeof(u64));
+       irte->hi.fields.vector = 0xff;
+}
+
+static bool irte_is_allocated(struct irq_remap_table *table, int index)
+{
+       union irte *ptr = (union irte *)table->table;
+       union irte *irte = &ptr[index];
+
+       return irte->val != 0;
+}
+
+static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
+{
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+
+       return irte->hi.fields.vector != 0;
+}
+
+static void irte_clear_allocated(struct irq_remap_table *table, int index)
+{
+       table->table[index] = 0;
+}
+
+static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
+{
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+
+       memset(&irte->lo.val, 0, sizeof(u64));
+       memset(&irte->hi.val, 0, sizeof(u64));
+}
+
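
Note the asymmetry between the two formats above: the 32-bit table has
the out-of-band IRTE_ALLOCATED pattern, but the 128-bit (GA) format has
no spare bits for one, so the vector field doubles as the allocation
marker: irte_ga_set_allocated() parks it at 0xff,
irte_ga_is_allocated() tests it for non-zero, and
irte_ga_clear_allocated() zeroes the whole entry.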
 static int get_devid(struct irq_alloc_info *info)
 {
        int devid = -1;
@@ -3821,19 +4051,17 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
 {
        struct irq_2_irte *irte_info = &data->irq_2_irte;
        struct msi_msg *msg = &data->msi_entry;
-       union irte *irte = &data->irte_entry;
        struct IO_APIC_route_entry *entry;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+       if (!iommu)
+               return;
 
        data->irq_2_irte.devid = devid;
        data->irq_2_irte.index = index + sub_handle;
-
-       /* Setup IRTE for IOMMU */
-       irte->val = 0;
-       irte->fields.vector      = irq_cfg->vector;
-       irte->fields.int_type    = apic->irq_delivery_mode;
-       irte->fields.destination = irq_cfg->dest_apicid;
-       irte->fields.dm          = apic->irq_dest_mode;
-       irte->fields.valid       = 1;
+       iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
+                                apic->irq_dest_mode, irq_cfg->vector,
+                                irq_cfg->dest_apicid, devid);
 
        switch (info->type) {
        case X86_IRQ_ALLOC_TYPE_IOAPIC:
@@ -3864,12 +4092,32 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
        }
 }
 
+struct amd_irte_ops irte_32_ops = {
+       .prepare = irte_prepare,
+       .activate = irte_activate,
+       .deactivate = irte_deactivate,
+       .set_affinity = irte_set_affinity,
+       .set_allocated = irte_set_allocated,
+       .is_allocated = irte_is_allocated,
+       .clear_allocated = irte_clear_allocated,
+};
+
+struct amd_irte_ops irte_128_ops = {
+       .prepare = irte_ga_prepare,
+       .activate = irte_ga_activate,
+       .deactivate = irte_ga_deactivate,
+       .set_affinity = irte_ga_set_affinity,
+       .set_allocated = irte_ga_set_allocated,
+       .is_allocated = irte_ga_is_allocated,
+       .clear_allocated = irte_ga_clear_allocated,
+};
+
 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
                               unsigned int nr_irqs, void *arg)
 {
        struct irq_alloc_info *info = arg;
        struct irq_data *irq_data;
-       struct amd_ir_data *data;
+       struct amd_ir_data *data = NULL;
        struct irq_cfg *cfg;
        int i, ret, devid;
        int index = -1;
@@ -3921,6 +4169,16 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
                if (!data)
                        goto out_free_data;
 
+               if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+                       data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
+               else
+                       data->entry = kzalloc(sizeof(struct irte_ga),
+                                                    GFP_KERNEL);
+               if (!data->entry) {
+                       kfree(data);
+                       goto out_free_data;
+               }
+
                irq_data->hwirq = (devid << 16) + i;
                irq_data->chip_data = data;
                irq_data->chip = &amd_ir_chip;
@@ -3957,6 +4215,7 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
                        data = irq_data->chip_data;
                        irte_info = &data->irq_2_irte;
                        free_irte(irte_info->devid, irte_info->index);
+                       kfree(data->entry);
                        kfree(data);
                }
        }
@@ -3968,8 +4227,11 @@ static void irq_remapping_activate(struct irq_domain *domain,
 {
        struct amd_ir_data *data = irq_data->chip_data;
        struct irq_2_irte *irte_info = &data->irq_2_irte;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
 
-       modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+       if (iommu)
+               iommu->irte_ops->activate(data->entry, irte_info->devid,
+                                         irte_info->index);
 }
 
 static void irq_remapping_deactivate(struct irq_domain *domain,
@@ -3977,10 +4239,11 @@ static void irq_remapping_deactivate(struct irq_domain *domain,
 {
        struct amd_ir_data *data = irq_data->chip_data;
        struct irq_2_irte *irte_info = &data->irq_2_irte;
-       union irte entry;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
 
-       entry.val = 0;
-       modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+       if (iommu)
+               iommu->irte_ops->deactivate(data->entry, irte_info->devid,
+                                           irte_info->index);
 }
 
 static struct irq_domain_ops amd_ir_domain_ops = {
@@ -3990,6 +4253,70 @@ static struct irq_domain_ops amd_ir_domain_ops = {
        .deactivate = irq_remapping_deactivate,
 };
 
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+       struct amd_iommu *iommu;
+       struct amd_iommu_pi_data *pi_data = vcpu_info;
+       struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+       struct amd_ir_data *ir_data = data->chip_data;
+       struct irte_ga *irte = (struct irte_ga *) ir_data->entry;
+       struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+       struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
+
+       /* Note:
+        * This device has never been set up for guest mode,
+        * so we must not modify the IRTE.
+        */
+       if (!dev_data || !dev_data->use_vapic)
+               return 0;
+
+       pi_data->ir_data = ir_data;
+
+       /* Note:
+        * SVM is trying to set up VAPIC mode, but the IOMMU is
+        * running in legacy mode, so force legacy mode instead.
+        */
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+               pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
+                        __func__);
+               pi_data->is_guest_mode = false;
+       }
+
+       iommu = amd_iommu_rlookup_table[irte_info->devid];
+       if (iommu == NULL)
+               return -EINVAL;
+
+       pi_data->prev_ga_tag = ir_data->cached_ga_tag;
+       if (pi_data->is_guest_mode) {
+               /* Setting up guest (vAPIC) mode */
+               irte->hi.fields.ga_root_ptr = (pi_data->base >> 12);
+               irte->hi.fields.vector = vcpu_pi_info->vector;
+               irte->lo.fields_vapic.guest_mode = 1;
+               irte->lo.fields_vapic.ga_tag = pi_data->ga_tag;
+
+               ir_data->cached_ga_tag = pi_data->ga_tag;
+       } else {
+               /* Reverting to legacy remapping mode */
+               struct irq_cfg *cfg = irqd_cfg(data);
+
+               irte->hi.val = 0;
+               irte->lo.val = 0;
+               irte->hi.fields.vector = cfg->vector;
+               irte->lo.fields_remap.guest_mode = 0;
+               irte->lo.fields_remap.destination = cfg->dest_apicid;
+               irte->lo.fields_remap.int_type = apic->irq_delivery_mode;
+               irte->lo.fields_remap.dm = apic->irq_dest_mode;
+
+               /*
+                * This communicates the ga_tag back to the caller
+                * so that it can do all the necessary cleanup.
+                */
+               ir_data->cached_ga_tag = 0;
+       }
+
+       return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data);
+}
+
 static int amd_ir_set_affinity(struct irq_data *data,
                               const struct cpumask *mask, bool force)
 {
@@ -3997,8 +4324,12 @@ static int amd_ir_set_affinity(struct irq_data *data,
        struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
        struct irq_cfg *cfg = irqd_cfg(data);
        struct irq_data *parent = data->parent_data;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
        int ret;
 
+       if (!iommu)
+               return -ENODEV;
+
        ret = parent->chip->irq_set_affinity(parent, mask, force);
        if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
                return ret;
@@ -4007,9 +4338,8 @@ static int amd_ir_set_affinity(struct irq_data *data,
         * Atomically updates the IRTE with the new destination, vector
         * and flushes the interrupt entry cache.
         */
-       ir_data->irte_entry.fields.vector = cfg->vector;
-       ir_data->irte_entry.fields.destination = cfg->dest_apicid;
-       modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry);
+       iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+                           irte_info->index, cfg->vector, cfg->dest_apicid);
 
        /*
         * After this point, all the interrupts will start arriving
@@ -4031,6 +4361,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
 static struct irq_chip amd_ir_chip = {
        .irq_ack = ir_ack_apic_edge,
        .irq_set_affinity = amd_ir_set_affinity,
+       .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
        .irq_compose_msi_msg = ir_compose_msi_msg,
 };
 
@@ -4045,4 +4376,43 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
 
        return 0;
 }
+
+int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+       unsigned long flags;
+       struct amd_iommu *iommu;
+       struct irq_remap_table *irt;
+       struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+       int devid = ir_data->irq_2_irte.devid;
+       struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+       struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
+
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+           !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+               return 0;
+
+       iommu = amd_iommu_rlookup_table[devid];
+       if (!iommu)
+               return -ENODEV;
+
+       irt = get_irq_table(devid, false);
+       if (!irt)
+               return -ENODEV;
+
+       spin_lock_irqsave(&irt->lock, flags);
+
+       if (ref->lo.fields_vapic.guest_mode) {
+               if (cpu >= 0)
+                       ref->lo.fields_vapic.destination = cpu;
+               ref->lo.fields_vapic.is_run = is_run;
+               barrier();
+       }
+
+       spin_unlock_irqrestore(&irt->lock, flags);
+
+       iommu_flush_irt(iommu, devid);
+       iommu_completion_wait(iommu);
+       return 0;
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
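
The expected caller here is KVM's AVIC vCPU-scheduling path, updating
the posted-interrupt IRTE as a vCPU migrates or blocks; a hypothetical
call sequence, given the ir_data cached by amd_ir_set_vcpu_affinity():

        amd_iommu_update_ga(cpu, true, ir_data);  /* vCPU now running on "cpu" */
        amd_iommu_update_ga(-1, false, ir_data);  /* vCPU scheduled out */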
 #endif
index 59741ea..cd17136 100644 (file)
@@ -84,6 +84,7 @@
 #define ACPI_DEVFLAG_LINT1              0x80
 #define ACPI_DEVFLAG_ATSDIS             0x10000000
 
+#define LOOP_TIMEOUT   100000
 /*
  * ACPI table definitions
  *
@@ -145,6 +146,8 @@ struct ivmd_header {
 bool amd_iommu_dump;
 bool amd_iommu_irq_remap __read_mostly;
 
+int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+
 static bool amd_iommu_detected;
 static bool __initdata amd_iommu_disabled;
 static int amd_iommu_target_ivhd_type;
@@ -386,6 +389,10 @@ static void iommu_disable(struct amd_iommu *iommu)
        iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
        iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
 
+       /* Disable IOMMU GA_LOG */
+       iommu_feature_disable(iommu, CONTROL_GALOG_EN);
+       iommu_feature_disable(iommu, CONTROL_GAINT_EN);
+
        /* Disable IOMMU hardware itself */
        iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
 }
@@ -671,6 +678,99 @@ static void __init free_ppr_log(struct amd_iommu *iommu)
        free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE));
 }
 
+static void free_ga_log(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+       if (iommu->ga_log)
+               free_pages((unsigned long)iommu->ga_log,
+                           get_order(GA_LOG_SIZE));
+       if (iommu->ga_log_tail)
+               free_pages((unsigned long)iommu->ga_log_tail,
+                           get_order(8));
+#endif
+}
+
+static int iommu_ga_log_enable(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+       u32 status, i;
+
+       if (!iommu->ga_log)
+               return -EINVAL;
+
+       status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+       /* Check if already running */
+       if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+               return 0;
+
+       iommu_feature_enable(iommu, CONTROL_GAINT_EN);
+       iommu_feature_enable(iommu, CONTROL_GALOG_EN);
+
+       for (i = 0; i < LOOP_TIMEOUT; ++i) {
+               status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+               if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+                       break;
+       }
+
+       if (i >= LOOP_TIMEOUT)
+               return -EINVAL;
+#endif /* CONFIG_IRQ_REMAP */
+       return 0;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static int iommu_init_ga_log(struct amd_iommu *iommu)
+{
+       u64 entry;
+
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+               return 0;
+
+       iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                       get_order(GA_LOG_SIZE));
+       if (!iommu->ga_log)
+               goto err_out;
+
+       iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                       get_order(8));
+       if (!iommu->ga_log_tail)
+               goto err_out;
+
+       entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+       memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
+                   &entry, sizeof(entry));
+       entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
+       memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
+                   &entry, sizeof(entry));
+       writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+       writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+       return 0;
+err_out:
+       free_ga_log(iommu);
+       return -EINVAL;
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+static int iommu_init_ga(struct amd_iommu *iommu)
+{
+       int ret = 0;
+
+#ifdef CONFIG_IRQ_REMAP
+       /* Note: We have already checked GASup from the IVRS table.
+        *       Now we need to make sure that GAMSup is also set.
+        */
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+           !iommu_feature(iommu, FEATURE_GAM_VAPIC))
+               amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
+       ret = iommu_init_ga_log(iommu);
+#endif /* CONFIG_IRQ_REMAP */
+
+       return ret;
+}
+
 static void iommu_enable_gt(struct amd_iommu *iommu)
 {
        if (!iommu_feature(iommu, FEATURE_GT))
@@ -1144,6 +1244,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu)
        free_command_buffer(iommu);
        free_event_buffer(iommu);
        free_ppr_log(iommu);
+       free_ga_log(iommu);
        iommu_unmap_mmio_space(iommu);
 }
 
@@ -1258,6 +1359,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
                        iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
                else
                        iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+               if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0))
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
                break;
        case 0x11:
        case 0x40:
@@ -1265,6 +1368,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
                        iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
                else
                        iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+               if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0))
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
                break;
        default:
                return -EINVAL;
@@ -1432,6 +1537,7 @@ static int iommu_init_pci(struct amd_iommu *iommu)
 {
        int cap_ptr = iommu->cap_ptr;
        u32 range, misc, low, high;
+       int ret;
 
        iommu->dev = pci_get_bus_and_slot(PCI_BUS_NUM(iommu->devid),
                                          iommu->devid & 0xff);
@@ -1488,6 +1594,10 @@ static int iommu_init_pci(struct amd_iommu *iommu)
        if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
                return -ENOMEM;
 
+       ret = iommu_init_ga(iommu);
+       if (ret)
+               return ret;
+
        if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
                amd_iommu_np_cache = true;
 
@@ -1545,16 +1655,24 @@ static void print_iommu_info(void)
                        dev_name(&iommu->dev->dev), iommu->cap_ptr);
 
                if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
-                       pr_info("AMD-Vi:  Extended features: ");
+                       pr_info("AMD-Vi: Extended features (%#llx):\n",
+                               iommu->features);
                        for (i = 0; i < ARRAY_SIZE(feat_str); ++i) {
                                if (iommu_feature(iommu, (1ULL << i)))
                                        pr_cont(" %s", feat_str[i]);
                        }
+
+                       if (iommu->features & FEATURE_GAM_VAPIC)
+                               pr_cont(" GA_vAPIC");
+
                        pr_cont("\n");
                }
        }
-       if (irq_remapping_enabled)
+       if (irq_remapping_enabled) {
                pr_info("AMD-Vi: Interrupt remapping enabled\n");
+               if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+                       pr_info("AMD-Vi: virtual APIC enabled\n");
+       }
 }
 
 static int __init amd_iommu_init_pci(void)
@@ -1645,6 +1763,8 @@ enable_faults:
        if (iommu->ppr_log != NULL)
                iommu_feature_enable(iommu, CONTROL_PPFINT_EN);
 
+       iommu_ga_log_enable(iommu);
+
        return 0;
 }
 
@@ -1862,6 +1982,24 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
                               iommu->stored_addr_lo | 1);
 }
 
+static void iommu_enable_ga(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+       switch (amd_iommu_guest_ir) {
+       case AMD_IOMMU_GUEST_IR_VAPIC:
+               iommu_feature_enable(iommu, CONTROL_GAM_EN);
+               /* Fall through */
+       case AMD_IOMMU_GUEST_IR_LEGACY_GA:
+               iommu_feature_enable(iommu, CONTROL_GA_EN);
+               iommu->irte_ops = &irte_128_ops;
+               break;
+       default:
+               iommu->irte_ops = &irte_32_ops;
+               break;
+       }
+#endif
+}
+
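
This switch is what keeps the rest of the remapping code
format-agnostic: both GA-capable modes (VAPIC and LEGACY_GA) install
irte_128_ops for struct irte_ga entries, everything else keeps the
32-bit irte_32_ops, and all other code dispatches through
iommu->irte_ops.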
 /*
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
@@ -1877,9 +2015,15 @@ static void early_enable_iommus(void)
                iommu_enable_command_buffer(iommu);
                iommu_enable_event_buffer(iommu);
                iommu_set_exclusion_range(iommu);
+               iommu_enable_ga(iommu);
                iommu_enable(iommu);
                iommu_flush_all_caches(iommu);
        }
+
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+               amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
+#endif
 }
 
 static void enable_iommus_v2(void)
@@ -1905,6 +2049,11 @@ static void disable_iommus(void)
 
        for_each_iommu(iommu)
                iommu_disable(iommu);
+
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+               amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+#endif
 }
 
 /*
@@ -2059,7 +2208,7 @@ static int __init early_amd_iommu_init(void)
        struct acpi_table_header *ivrs_base;
        acpi_size ivrs_size;
        acpi_status status;
-       int i, ret = 0;
+       int i, remap_cache_sz, ret = 0;
 
        if (!amd_iommu_detected)
                return -ENODEV;
@@ -2157,10 +2306,14 @@ static int __init early_amd_iommu_init(void)
                 * remapping tables.
                 */
                ret = -ENOMEM;
+               if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+                       remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32);
+               else
+                       remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2);
                amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache",
-                               MAX_IRQS_PER_TABLE * sizeof(u32),
-                               IRQ_TABLE_ALIGNMENT,
-                               0, NULL);
+                                                       remap_cache_sz,
+                                                       IRQ_TABLE_ALIGNMENT,
+                                                       0, NULL);
                if (!amd_iommu_irq_cache)
                        goto out;
 
@@ -2413,6 +2566,21 @@ static int __init parse_amd_iommu_dump(char *str)
        return 1;
 }
 
+static int __init parse_amd_iommu_intr(char *str)
+{
+       for (; *str; ++str) {
+               if (strncmp(str, "legacy", 6) == 0) {
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+                       break;
+               }
+               if (strncmp(str, "vapic", 5) == 0) {
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+                       break;
+               }
+       }
+       return 1;
+}
+
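
This registers a new kernel command-line parameter: booting with
"amd_iommu_intr=legacy" forces 32-bit remapping IRTEs, while
"amd_iommu_intr=vapic" keeps the default chosen above and allows guest
virtual APIC mode when the hardware advertises GASup/GAMSup.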
 static int __init parse_amd_iommu_options(char *str)
 {
        for (; *str; ++str) {
@@ -2521,6 +2689,7 @@ static int __init parse_ivrs_acpihid(char *str)
 
 __setup("amd_iommu_dump",      parse_amd_iommu_dump);
 __setup("amd_iommu=",          parse_amd_iommu_options);
+__setup("amd_iommu_intr=",     parse_amd_iommu_intr);
 __setup("ivrs_ioapic",         parse_ivrs_ioapic);
 __setup("ivrs_hpet",           parse_ivrs_hpet);
 __setup("ivrs_acpihid",                parse_ivrs_acpihid);
index 0bd9eb3..faa3b48 100644 (file)
@@ -38,6 +38,7 @@ extern int amd_iommu_enable(void);
 extern void amd_iommu_disable(void);
 extern int amd_iommu_reenable(int);
 extern int amd_iommu_enable_faulting(void);
+extern int amd_iommu_guest_ir;
 
 /* IOMMUv2 specific functions */
 struct iommu_domain;
index 9652848..0d91785 100644 (file)
@@ -22,6 +22,7 @@
 
 #include <linux/types.h>
 #include <linux/mutex.h>
+#include <linux/msi.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
@@ -69,6 +70,8 @@
 #define MMIO_EXCL_LIMIT_OFFSET  0x0028
 #define MMIO_EXT_FEATURES      0x0030
 #define MMIO_PPR_LOG_OFFSET    0x0038
+#define MMIO_GA_LOG_BASE_OFFSET        0x00e0
+#define MMIO_GA_LOG_TAIL_OFFSET        0x00e8
 #define MMIO_CMD_HEAD_OFFSET   0x2000
 #define MMIO_CMD_TAIL_OFFSET   0x2008
 #define MMIO_EVT_HEAD_OFFSET   0x2010
@@ -76,6 +79,8 @@
 #define MMIO_STATUS_OFFSET     0x2020
 #define MMIO_PPR_HEAD_OFFSET   0x2030
 #define MMIO_PPR_TAIL_OFFSET   0x2038
+#define MMIO_GA_HEAD_OFFSET    0x2040
+#define MMIO_GA_TAIL_OFFSET    0x2048
 #define MMIO_CNTR_CONF_OFFSET  0x4000
 #define MMIO_CNTR_REG_OFFSET   0x40000
 #define MMIO_REG_END_OFFSET    0x80000
@@ -92,6 +97,7 @@
 #define FEATURE_GA             (1ULL<<7)
 #define FEATURE_HE             (1ULL<<8)
 #define FEATURE_PC             (1ULL<<9)
+#define FEATURE_GAM_VAPIC      (1ULL<<21)
 
 #define FEATURE_PASID_SHIFT    32
 #define FEATURE_PASID_MASK     (0x1fULL << FEATURE_PASID_SHIFT)
 #define MMIO_STATUS_EVT_INT_MASK       (1 << 1)
 #define MMIO_STATUS_COM_WAIT_INT_MASK  (1 << 2)
 #define MMIO_STATUS_PPR_INT_MASK       (1 << 6)
+#define MMIO_STATUS_GALOG_RUN_MASK     (1 << 8)
+#define MMIO_STATUS_GALOG_OVERFLOW_MASK        (1 << 9)
+#define MMIO_STATUS_GALOG_INT_MASK     (1 << 10)
 
 /* event logging constants */
 #define EVENT_ENTRY_SIZE       0x10
 #define CONTROL_PPFINT_EN       0x0eULL
 #define CONTROL_PPR_EN          0x0fULL
 #define CONTROL_GT_EN           0x10ULL
+#define CONTROL_GA_EN           0x11ULL
+#define CONTROL_GAM_EN          0x19ULL
+#define CONTROL_GALOG_EN        0x1cULL
+#define CONTROL_GAINT_EN        0x1dULL
 
 #define CTRL_INV_TO_MASK       (7 << CONTROL_INV_TIMEOUT)
 #define CTRL_INV_TO_NONE       0
 
 #define PPR_REQ_FAULT          0x01
 
+/* Constants for GA Log handling */
+#define GA_LOG_ENTRIES         512
+#define GA_LOG_SIZE_SHIFT      56
+#define GA_LOG_SIZE_512                (0x8ULL << GA_LOG_SIZE_SHIFT)
+#define GA_ENTRY_SIZE          8
+#define GA_LOG_SIZE            (GA_ENTRY_SIZE * GA_LOG_ENTRIES)
+
+#define GA_TAG(x)              (u32)((x) & 0xffffffffULL)
+#define GA_DEVID(x)            (u16)(((x) >> 32) & 0xffffULL)
+#define GA_REQ_TYPE(x)         (((x) >> 60) & 0xfULL)
+
+#define GA_GUEST_NR            0x1
+
 #define PAGE_MODE_NONE    0x00
 #define PAGE_MODE_1_LEVEL 0x01
 #define PAGE_MODE_2_LEVEL 0x02
 #define IOMMU_CAP_NPCACHE 26
 #define IOMMU_CAP_EFR     27
 
+/* IOMMU Feature Reporting Field (for IVHD type 10h) */
+#define IOMMU_FEAT_GASUP_SHIFT 6
+
+/* IOMMU Extended Feature Register (EFR) */
+#define IOMMU_EFR_GASUP_SHIFT  7
+
 #define MAX_DOMAIN_ID 65536
 
 /* Protection domain flags */
@@ -400,6 +432,7 @@ struct amd_iommu_fault {
 
 struct iommu_domain;
 struct irq_domain;
+struct amd_irte_ops;
 
 /*
  * This structure contains generic data for  IOMMU protection domains
@@ -490,6 +523,12 @@ struct amd_iommu {
        /* Base of the PPR log, if present */
        u8 *ppr_log;
 
+       /* Base of the GA log, if present */
+       u8 *ga_log;
+
+       /* Tail of the GA log, if present */
+       u8 *ga_log_tail;
+
        /* true if interrupts for this IOMMU are already enabled */
        bool int_enabled;
 
@@ -523,6 +562,8 @@ struct amd_iommu {
 #ifdef CONFIG_IRQ_REMAP
        struct irq_domain *ir_domain;
        struct irq_domain *msi_domain;
+
+       struct amd_irte_ops *irte_ops;
 #endif
 
        volatile u64 __aligned(8) cmd_sem;
@@ -683,4 +724,112 @@ static inline int get_hpet_devid(int id)
        return -EINVAL;
 }
 
+enum amd_iommu_intr_mode_type {
+       AMD_IOMMU_GUEST_IR_LEGACY,
+
+       /* This mode is not visible to users. It is used when
+        * we cannot fully enable vAPIC and have to fall back to
+        * supporting only legacy interrupt remapping via 128-bit IRTE.
+        */
+       AMD_IOMMU_GUEST_IR_LEGACY_GA,
+       AMD_IOMMU_GUEST_IR_VAPIC,
+};
+
+#define AMD_IOMMU_GUEST_IR_GA(x)       ((x) == AMD_IOMMU_GUEST_IR_VAPIC || \
+                                        (x) == AMD_IOMMU_GUEST_IR_LEGACY_GA)
+
+#define AMD_IOMMU_GUEST_IR_VAPIC(x)    ((x) == AMD_IOMMU_GUEST_IR_VAPIC)
+
+union irte {
+       u32 val;
+       struct {
+               u32 valid       : 1,
+                   no_fault    : 1,
+                   int_type    : 3,
+                   rq_eoi      : 1,
+                   dm          : 1,
+                   rsvd_1      : 1,
+                   destination : 8,
+                   vector      : 8,
+                   rsvd_2      : 8;
+       } fields;
+};
+
+union irte_ga_lo {
+       u64 val;
+
+       /* For int remapping */
+       struct {
+               u64 valid       : 1,
+                   no_fault    : 1,
+                   /* ------ */
+                   int_type    : 3,
+                   rq_eoi      : 1,
+                   dm          : 1,
+                   /* ------ */
+                   guest_mode  : 1,
+                   destination : 8,
+                   rsvd        : 48;
+       } fields_remap;
+
+       /* For guest vAPIC */
+       struct {
+               u64 valid       : 1,
+                   no_fault    : 1,
+                   /* ------ */
+                   ga_log_intr : 1,
+                   rsvd1       : 3,
+                   is_run      : 1,
+                   /* ------ */
+                   guest_mode  : 1,
+                   destination : 8,
+                   rsvd2       : 16,
+                   ga_tag      : 32;
+       } fields_vapic;
+};
+
+union irte_ga_hi {
+       u64 val;
+       struct {
+               u64 vector      : 8,
+                   rsvd_1      : 4,
+                   ga_root_ptr : 40,
+                   rsvd_2      : 12;
+       } fields;
+};
+
+struct irte_ga {
+       union irte_ga_lo lo;
+       union irte_ga_hi hi;
+};
+
+struct irq_2_irte {
+       u16 devid; /* Device ID for IRTE table */
+       u16 index; /* Index into IRTE table */
+};
+
+struct amd_ir_data {
+       u32 cached_ga_tag;
+       struct irq_2_irte irq_2_irte;
+       struct msi_msg msi_entry;
+       void *entry;    /* Pointer to union irte or struct irte_ga */
+       void *ref;      /* Pointer to the actual irte */
+};
+
+struct amd_irte_ops {
+       void (*prepare)(void *, u32, u32, u8, u32, int);
+       void (*activate)(void *, u16, u16);
+       void (*deactivate)(void *, u16, u16);
+       void (*set_affinity)(void *, u16, u16, u8, u32);
+       void *(*get)(struct irq_remap_table *, int);
+       void (*set_allocated)(struct irq_remap_table *, int);
+       bool (*is_allocated)(struct irq_remap_table *, int);
+       void (*clear_allocated)(struct irq_remap_table *, int);
+};
+
+#ifdef CONFIG_IRQ_REMAP
+extern struct amd_irte_ops irte_32_ops;
+extern struct amd_irte_ops irte_128_ops;
+#endif
+
 #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
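
The two IRTE layouts above are what the remap_cache_sz computation in early_amd_iommu_init() relies on: a legacy entry is a single u32, while a GA entry is a 128-bit lo/hi pair. A minimal compile-time sketch of that invariant (the BUILD_BUG_ON lines are an editor's illustration, not part of the patch):

    /* Illustration only: the sizes assumed by the irq_remap_cache
     * sizing in early_amd_iommu_init() above. */
    BUILD_BUG_ON(sizeof(union irte) != sizeof(u32));
    BUILD_BUG_ON(sizeof(struct irte_ga) != sizeof(u64) * 2);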
index 19b698e..002f092 100644
 #include <linux/kvm.h>
 #include <linux/irqreturn.h>
 #include <linux/spinlock.h>
+#include <linux/static_key.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
 #include <linux/list.h>
+#include <linux/jump_label.h>
 
 #define VGIC_V3_MAX_CPUS       255
 #define VGIC_V2_MAX_CPUS       8
@@ -49,6 +51,9 @@ struct vgic_global {
        /* Physical address of vgic virtual cpu interface */
        phys_addr_t             vcpu_base;
 
+       /* GICV mapping */
+       void __iomem            *vcpu_base_va;
+
        /* virtual control interface mapping */
        void __iomem            *vctrl_base;
 
@@ -63,6 +68,9 @@ struct vgic_global {
 
        /* Only needed for the legacy KVM_CREATE_IRQCHIP */
        bool                    can_emulate_gicv2;
+
+       /* GIC system register CPU interface */
+       struct static_key_false gicv3_cpuif;
 };
 
 extern struct vgic_global kvm_vgic_global_state;
@@ -217,7 +225,6 @@ struct vgic_v2_cpu_if {
 };
 
 struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        u32             vgic_hcr;
        u32             vgic_vmcr;
        u32             vgic_sre;       /* Restored only, change ignored */
@@ -227,7 +234,6 @@ struct vgic_v3_cpu_if {
        u32             vgic_ap0r[4];
        u32             vgic_ap1r[4];
        u64             vgic_lr[VGIC_V3_MAX_LRS];
-#endif
 };
 
 struct vgic_cpu {
@@ -265,6 +271,8 @@ struct vgic_cpu {
        bool lpis_enabled;
 };
 
+extern struct static_key_false vgic_v2_cpuif_trap;
+
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
@@ -294,13 +302,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 
-#ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
 
 /**
  * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
index 2b08e79..09751d3 100644
 
 #include <linux/types.h>
 
+/*
+ * This is mainly used to communicate information back and forth
+ * between SVM and the IOMMU when setting up and tearing down
+ * posted interrupts.
+ */
+struct amd_iommu_pi_data {
+       u32 ga_tag;
+       u32 prev_ga_tag;
+       u64 base;
+       bool is_guest_mode;
+       struct vcpu_data *vcpu_data;
+       void *ir_data;
+};
+
 #ifdef CONFIG_AMD_IOMMU
 
 struct task_struct;
@@ -168,11 +182,34 @@ typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid);
 
 extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
                                           amd_iommu_invalidate_ctx cb);
-
-#else
+#else /* CONFIG_AMD_IOMMU */
 
 static inline int amd_iommu_detect(void) { return -ENODEV; }
 
-#endif
+#endif /* CONFIG_AMD_IOMMU */
+
+#if defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP)
+
+/* IOMMU AVIC functions */
+extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
+
+extern int
+amd_iommu_update_ga(int cpu, bool is_run, void *data);
+
+#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
+
+static inline int
+amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+       return 0;
+}
+
+static inline int
+amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+       return 0;
+}
+
+#endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
 
 #endif /* _ASM_X86_AMD_IOMMU_H */
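
The two exports above form the IOMMU-side contract for posted interrupts: a hypervisor registers a callback that is invoked with the GA tag of each GA-log entry, and calls amd_iommu_update_ga() as vCPUs are scheduled in and out. A consumer-side sketch (the handler body and the ir_data plumbing are assumptions; only the two entry points come from this header):

    /* Sketch: the handler receives the GA tag of a GA-log entry;
     * the vCPU lookup behind it is an assumption. */
    static int sketch_ga_log_notifier(u32 ga_tag)
    {
            /* e.g. find and kick the vCPU encoded in ga_tag (hypothetical) */
            return 0;
    }

    static int sketch_init(void)
    {
            return amd_iommu_register_ga_log_notifier(sketch_ga_log_notifier);
    }

    static void sketch_vcpu_load(int cpu, void *ir_data)
    {
            /* is_run == true: the vCPU now runs on @cpu */
            amd_iommu_update_ga(cpu, true, ir_data);
    }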
index 9c28b4d..01c0b9c 100644
@@ -265,6 +265,7 @@ struct kvm_vcpu {
 #endif
        bool preempted;
        struct kvm_vcpu_arch arch;
+       struct dentry *debugfs_dentry;
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -749,6 +750,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
+bool kvm_arch_has_vcpu_debugfs(void);
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
 int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
 int kvm_arch_hardware_setup(void);
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
new file mode 100644
index 0000000..8d96437
--- /dev/null
@@ -0,0 +1,32 @@
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_BLK_DEV=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_TTY=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_BINFMT_ELF=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_9P_FS=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
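
As a usage note (the fragment-merging make target is generic kconfig behaviour, stated here as an assumption rather than something this hunk adds): a fragment under kernel/configs/ can be merged on top of an existing configuration with

    make defconfig
    make kvm_guest.config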
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
new file mode 100644
index 0000000..528af4b
--- /dev/null
@@ -0,0 +1,152 @@
+/*
+ * (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#ifndef CONFIG_ARM64
+#define COMPAT_PSR_T_BIT       PSR_T_BIT
+#define COMPAT_PSR_IT_MASK     PSR_IT_MASK
+#endif
+
+/*
+ * stolen from arch/arm/kernel/opcodes.c
+ *
+ * condition code lookup table
+ * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
+ *
+ * bit position in short is condition code: NZCV
+ */
+static const unsigned short cc_map[16] = {
+       0xF0F0,                 /* EQ == Z set            */
+       0x0F0F,                 /* NE                     */
+       0xCCCC,                 /* CS == C set            */
+       0x3333,                 /* CC                     */
+       0xFF00,                 /* MI == N set            */
+       0x00FF,                 /* PL                     */
+       0xAAAA,                 /* VS == V set            */
+       0x5555,                 /* VC                     */
+       0x0C0C,                 /* HI == C set && Z clear */
+       0xF3F3,                 /* LS == C clear || Z set */
+       0xAA55,                 /* GE == (N==V)           */
+       0x55AA,                 /* LT == (N!=V)           */
+       0x0A05,                 /* GT == (!Z && (N==V))   */
+       0xF5FA,                 /* LE == (Z || (N!=V))    */
+       0xFFFF,                 /* AL always              */
+       0                       /* NV                     */
+};
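+
+/*
+ * Worked example (editor's note): a CS-conditional instruction has
+ * cond == 2, so cc_map[2] == 0xCCCC is consulted. With only the C
+ * flag set, NZCV == 0b0010 (value 2), and bit 2 of 0xCCCC is 1, so
+ * the instruction passes its condition check.
+ */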
+
+/*
+ * Check if a trapped instruction should have been executed or not.
+ */
+bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
+{
+       unsigned long cpsr;
+       u32 cpsr_cond;
+       int cond;
+
+       /* Top two bits non-zero?  Unconditional. */
+       if (kvm_vcpu_get_hsr(vcpu) >> 30)
+               return true;
+
+       /* Is condition field valid? */
+       cond = kvm_vcpu_get_condition(vcpu);
+       if (cond == 0xE)
+               return true;
+
+       cpsr = *vcpu_cpsr(vcpu);
+
+       if (cond < 0) {
+               /* This can happen in Thumb mode: examine IT state. */
+               unsigned long it;
+
+               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
+
+               /* it == 0 => unconditional. */
+               if (it == 0)
+                       return true;
+
+               /* The cond for this insn works out as the top 4 bits. */
+               cond = (it >> 4);
+       }
+
+       cpsr_cond = cpsr >> 28;
+
+       if (!((cc_map[cond] >> cpsr_cond) & 1))
+               return false;
+
+       return true;
+}
+
+/**
+ * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
+ * @vcpu:      The VCPU pointer
+ *
+ * When exceptions occur while instructions are executed in Thumb IF-THEN
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
+ * to do this little bit of work manually. The fields map like this:
+ *
+ * IT[7:0] -> CPSR[26:25],CPSR[15:10]
+ */
+static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
+{
+       unsigned long itbits, cond;
+       unsigned long cpsr = *vcpu_cpsr(vcpu);
+       bool is_arm = !(cpsr & COMPAT_PSR_T_BIT);
+
+       if (is_arm || !(cpsr & COMPAT_PSR_IT_MASK))
+               return;
+
+       cond = (cpsr & 0xe000) >> 13;
+       itbits = (cpsr & 0x1c00) >> (10 - 2);
+       itbits |= (cpsr & (0x3 << 25)) >> 25;
+
+       /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
+       if ((itbits & 0x7) == 0)
+               itbits = cond = 0;
+       else
+               itbits = (itbits << 1) & 0x1f;
+
+       cpsr &= ~COMPAT_PSR_IT_MASK;
+       cpsr |= cond << 13;
+       cpsr |= (itbits & 0x1c) << (10 - 2);
+       cpsr |= (itbits & 0x3) << 25;
+       *vcpu_cpsr(vcpu) = cpsr;
+}
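+
+/*
+ * Worked example (editor's note): for "ITT EQ", ITSTATE starts as
+ * firstcond:mask = 0000:0100, so itbits == 0b00100. The first advance
+ * shifts this to 0b01000; on the next advance the low three bits are
+ * zero, so the IT state is cleared and the block is finished.
+ */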
+
+/**
+ * kvm_skip_instr - skip a trapped instruction and proceed to the next
+ * @vcpu: The vcpu pointer
+ */
+void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+       bool is_thumb;
+
+       is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT);
+       if (is_thumb && !is_wide_instr)
+               *vcpu_pc(vcpu) += 2;
+       else
+               *vcpu_pc(vcpu) += 4;
+       kvm_adjust_itstate(vcpu);
+}
index 77e6ccf..27a1f63 100644
@@ -31,7 +31,6 @@
 #include "trace.h"
 
 static struct timecounter *timecounter;
-static struct workqueue_struct *wqueue;
 static unsigned int host_vtimer_irq;
 static u32 host_vtimer_irq_flags;
 
@@ -141,7 +140,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
                return HRTIMER_RESTART;
        }
 
-       queue_work(wqueue, &timer->expired);
+       schedule_work(&timer->expired);
        return HRTIMER_NORESTART;
 }
 
@@ -446,13 +445,7 @@ int kvm_timer_hyp_init(void)
        if (err) {
                kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
                        host_vtimer_irq, err);
-               goto out;
-       }
-
-       wqueue = create_singlethread_workqueue("kvm_arch_timer");
-       if (!wqueue) {
-               err = -ENOMEM;
-               goto out_free;
+               return err;
        }
 
        kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
@@ -460,10 +453,6 @@ int kvm_timer_hyp_init(void)
        cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
                          "AP_KVM_ARM_TIMER_STARTING", kvm_timer_starting_cpu,
                          kvm_timer_dying_cpu);
-       goto out;
-out_free:
-       free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
-out:
        return err;
 }
 
@@ -518,7 +507,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
         * VCPUs have the enabled variable set, before entering the guest, if
         * the arch timers are enabled.
         */
-       if (timecounter && wqueue)
+       if (timecounter)
                timer->enabled = 1;
 
        return 0;
index 7cffd93..c8aeb7b 100644
@@ -19,6 +19,7 @@
 #include <linux/irqchip/arm-gic.h>
 #include <linux/kvm_host.h>
 
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
@@ -167,3 +168,59 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
        writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
        vcpu->arch.vgic_cpu.live_lrs = live_lrs;
 }
+
+#ifdef CONFIG_ARM64
+/*
+ * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
+ *                                  guest.
+ *
+ * @vcpu: the offending vcpu
+ *
+ * Returns:
+ *  1: GICV access successfully performed
+ *  0: Not a GICV access
+ * -1: Illegal GICV access
+ */
+int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       phys_addr_t fault_ipa;
+       void __iomem *addr;
+       int rd;
+
+       /* Build the full address */
+       fault_ipa  = kvm_vcpu_get_fault_ipa(vcpu);
+       fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+
+       /* If not for GICV, move on */
+       if (fault_ipa <  vgic->vgic_cpu_base ||
+           fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
+               return 0;
+
+       /* Reject anything but a 32bit access */
+       if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
+               return -1;
+
+       /* Not aligned? Don't bother */
+       if (fault_ipa & 3)
+               return -1;
+
+       rd = kvm_vcpu_dabt_get_rd(vcpu);
+       addr  = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va);
+       addr += fault_ipa - vgic->vgic_cpu_base;
+
+       if (kvm_vcpu_dabt_iswrite(vcpu)) {
+               u32 data = vcpu_data_guest_to_host(vcpu,
+                                                  vcpu_get_reg(vcpu, rd),
+                                                  sizeof(u32));
+               writel_relaxed(data, addr);
+       } else {
+               u32 data = readl_relaxed(addr);
+               vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
+                                                              sizeof(u32)));
+       }
+
+       return 1;
+}
+#endif
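
Given the documented return values, a plausible shape for the call site in the host's data-abort path is sketched below (the fault-injection helper is hypothetical and the surrounding logic is an assumption; only __vgic_v2_perform_cpuif_access() and kvm_skip_instr32() appear in this series):

    /* Sketch of a caller, under the assumptions stated above. */
    int ret = __vgic_v2_perform_cpuif_access(vcpu);

    if (ret == 1)           /* emulated: skip the trapped access */
            kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
    else if (ret == -1)     /* illegal access: fault the guest (assumed) */
            inject_dabt_to_guest(vcpu);     /* hypothetical helper */
    /* ret == 0: not GICV, fall back to normal MMIO emulation */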
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
new file mode 100644
index 0000000..3947095
--- /dev/null
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+
+#define vtr_to_max_lr_idx(v)           ((v) & 0xf)
+#define vtr_to_nr_pri_bits(v)          (((u32)(v) >> 29) + 1)
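+
+/* Example (editor's note): ICH_VTR_EL2 == 0x90000007 decodes to
+ * max_lr_idx == 7 (i.e. 8 list registers) and (4 + 1) == 5 priority bits. */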
+
+static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
+{
+       switch (lr & 0xf) {
+       case 0:
+               return read_gicreg(ICH_LR0_EL2);
+       case 1:
+               return read_gicreg(ICH_LR1_EL2);
+       case 2:
+               return read_gicreg(ICH_LR2_EL2);
+       case 3:
+               return read_gicreg(ICH_LR3_EL2);
+       case 4:
+               return read_gicreg(ICH_LR4_EL2);
+       case 5:
+               return read_gicreg(ICH_LR5_EL2);
+       case 6:
+               return read_gicreg(ICH_LR6_EL2);
+       case 7:
+               return read_gicreg(ICH_LR7_EL2);
+       case 8:
+               return read_gicreg(ICH_LR8_EL2);
+       case 9:
+               return read_gicreg(ICH_LR9_EL2);
+       case 10:
+               return read_gicreg(ICH_LR10_EL2);
+       case 11:
+               return read_gicreg(ICH_LR11_EL2);
+       case 12:
+               return read_gicreg(ICH_LR12_EL2);
+       case 13:
+               return read_gicreg(ICH_LR13_EL2);
+       case 14:
+               return read_gicreg(ICH_LR14_EL2);
+       case 15:
+               return read_gicreg(ICH_LR15_EL2);
+       }
+
+       unreachable();
+}
+
+static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
+{
+       switch (lr & 0xf) {
+       case 0:
+               write_gicreg(val, ICH_LR0_EL2);
+               break;
+       case 1:
+               write_gicreg(val, ICH_LR1_EL2);
+               break;
+       case 2:
+               write_gicreg(val, ICH_LR2_EL2);
+               break;
+       case 3:
+               write_gicreg(val, ICH_LR3_EL2);
+               break;
+       case 4:
+               write_gicreg(val, ICH_LR4_EL2);
+               break;
+       case 5:
+               write_gicreg(val, ICH_LR5_EL2);
+               break;
+       case 6:
+               write_gicreg(val, ICH_LR6_EL2);
+               break;
+       case 7:
+               write_gicreg(val, ICH_LR7_EL2);
+               break;
+       case 8:
+               write_gicreg(val, ICH_LR8_EL2);
+               break;
+       case 9:
+               write_gicreg(val, ICH_LR9_EL2);
+               break;
+       case 10:
+               write_gicreg(val, ICH_LR10_EL2);
+               break;
+       case 11:
+               write_gicreg(val, ICH_LR11_EL2);
+               break;
+       case 12:
+               write_gicreg(val, ICH_LR12_EL2);
+               break;
+       case 13:
+               write_gicreg(val, ICH_LR13_EL2);
+               break;
+       case 14:
+               write_gicreg(val, ICH_LR14_EL2);
+               break;
+       case 15:
+               write_gicreg(val, ICH_LR15_EL2);
+               break;
+       }
+}
+
+static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       int i;
+       bool expect_mi;
+
+       expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE);
+
+       for (i = 0; i < nr_lr; i++) {
+               if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
+                       continue;
+
+               expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) &&
+                             (cpu_if->vgic_lr[i] & ICH_LR_EOI));
+       }
+
+       if (expect_mi) {
+               cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
+
+               if (cpu_if->vgic_misr & ICH_MISR_EOI)
+                       cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2);
+               else
+                       cpu_if->vgic_eisr = 0;
+       } else {
+               cpu_if->vgic_misr = 0;
+               cpu_if->vgic_eisr = 0;
+       }
+}
+
+void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+
+       /*
+        * Make sure stores to the GIC via the memory mapped interface
+        * are now visible to the system register interface.
+        */
+       if (!cpu_if->vgic_sre)
+               dsb(st);
+
+       cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
+
+       if (vcpu->arch.vgic_cpu.live_lrs) {
+               int i;
+               u32 max_lr_idx, nr_pri_bits;
+
+               cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
+
+               write_gicreg(0, ICH_HCR_EL2);
+               val = read_gicreg(ICH_VTR_EL2);
+               max_lr_idx = vtr_to_max_lr_idx(val);
+               nr_pri_bits = vtr_to_nr_pri_bits(val);
+
+               save_maint_int_state(vcpu, max_lr_idx + 1);
+
+               for (i = 0; i <= max_lr_idx; i++) {
+                       if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
+                               continue;
+
+                       if (cpu_if->vgic_elrsr & (1 << i))
+                               cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
+                       else
+                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
+
+                       __gic_v3_set_lr(0, i);
+               }
+
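+               /* The switches below intentionally fall through: each
+                * extra priority-bits level saves one more AP register. */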
+               switch (nr_pri_bits) {
+               case 7:
+                       cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
+                       cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
+               case 6:
+                       cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
+               default:
+                       cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
+               }
+
+               switch (nr_pri_bits) {
+               case 7:
+                       cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
+                       cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
+               case 6:
+                       cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
+               default:
+                       cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
+               }
+
+               vcpu->arch.vgic_cpu.live_lrs = 0;
+       } else {
+               cpu_if->vgic_misr  = 0;
+               cpu_if->vgic_eisr  = 0;
+               cpu_if->vgic_elrsr = 0xffff;
+               cpu_if->vgic_ap0r[0] = 0;
+               cpu_if->vgic_ap0r[1] = 0;
+               cpu_if->vgic_ap0r[2] = 0;
+               cpu_if->vgic_ap0r[3] = 0;
+               cpu_if->vgic_ap1r[0] = 0;
+               cpu_if->vgic_ap1r[1] = 0;
+               cpu_if->vgic_ap1r[2] = 0;
+               cpu_if->vgic_ap1r[3] = 0;
+       }
+
+       val = read_gicreg(ICC_SRE_EL2);
+       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+
+       if (!cpu_if->vgic_sre) {
+               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+               isb();
+               write_gicreg(1, ICC_SRE_EL1);
+       }
+}
+
+void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+       u32 max_lr_idx, nr_pri_bits;
+       u16 live_lrs = 0;
+       int i;
+
+       /*
+        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+        * Group0 interrupt (as generated in GICv2 mode) to be
+        * delivered as a FIQ to the guest, with potentially fatal
+        * consequences. So we must make sure that ICC_SRE_EL1 has
+        * been actually programmed with the value we want before
+        * starting to mess with the rest of the GIC.
+        */
+       if (!cpu_if->vgic_sre) {
+               write_gicreg(0, ICC_SRE_EL1);
+               isb();
+       }
+
+       val = read_gicreg(ICH_VTR_EL2);
+       max_lr_idx = vtr_to_max_lr_idx(val);
+       nr_pri_bits = vtr_to_nr_pri_bits(val);
+
+       for (i = 0; i <= max_lr_idx; i++) {
+               if (cpu_if->vgic_lr[i] & ICH_LR_STATE)
+                       live_lrs |= (1 << i);
+       }
+
+       write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+       if (live_lrs) {
+               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
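+               /* As on the save path, these switches intentionally fall
+                * through, restoring the lower AP registers as well. */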
+               switch (nr_pri_bits) {
+               case 7:
+                       write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
+                       write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
+               case 6:
+                       write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
+               default:
+                       write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
+               }
+
+               switch (nr_pri_bits) {
+               case 7:
+                       write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
+                       write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
+               case 6:
+                       write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
+               default:
+                       write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
+               }
+
+               for (i = 0; i <= max_lr_idx; i++) {
+                       if (!(live_lrs & (1 << i)))
+                               continue;
+
+                       __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+               }
+       }
+
+       /*
+        * Ensures that the above will have reached the
+        * (re)distributors. This ensures the guest will read the
+        * correct values from the memory-mapped interface.
+        */
+       if (!cpu_if->vgic_sre) {
+               isb();
+               dsb(sy);
+       }
+       vcpu->arch.vgic_cpu.live_lrs = live_lrs;
+
+       /*
+        * Prevent the guest from touching the GIC system registers if
+        * SRE isn't enabled for GICv3 emulation.
+        */
+       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+                    ICC_SRE_EL2);
+}
+
+void __hyp_text __vgic_v3_init_lrs(void)
+{
+       int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
+       int i;
+
+       for (i = 0; i <= max_lr_idx; i++)
+               __gic_v3_set_lr(0, i);
+}
+
+u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
+{
+       return read_gicreg(ICH_VTR_EL2);
+}
index a027569..6e9c40e 100644
@@ -423,6 +423,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
        if (!kvm_arm_support_pmu_v3())
                return -ENODEV;
 
+       /*
+        * We currently require an in-kernel VGIC to use the PMU emulation,
+        * because we do not support forwarding PMU overflow interrupts to
+        * userspace yet.
+        */
+       if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))
+               return -ENODEV;
+
        if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) ||
            !kvm_arm_pmu_irq_initialized(vcpu))
                return -ENXIO;
index 83777c1..8cebfbc 100644
@@ -405,6 +405,10 @@ int kvm_vgic_hyp_init(void)
                break;
        case GIC_V3:
                ret = vgic_v3_probe(gic_kvm_info);
+               if (!ret) {
+                       static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+                       kvm_info("GIC system register CPU interface enabled\n");
+               }
                break;
        default:
                ret = -ENODEV;
index b31a51a..d918dcf 100644
@@ -46,15 +46,9 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
  * @ue: user api routing entry handle
  * return 0 on success, -EINVAL on errors.
  */
-#ifdef KVM_CAP_X2APIC_API
 int kvm_set_routing_entry(struct kvm *kvm,
                          struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
-#else
-/* Remove this version and the ifdefery once merged into 4.8 */
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
-                         const struct kvm_irq_routing_entry *ue)
-#endif
 {
        int r = -EINVAL;
 
index 1813f93..ce1f4ed 100644
@@ -71,7 +71,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                addr_ptr = &vgic->vgic_cpu_base;
                alignment = SZ_4K;
                break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        case KVM_VGIC_V3_ADDR_TYPE_DIST:
                type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
                addr_ptr = &vgic->vgic_dist_base;
@@ -82,7 +81,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                addr_ptr = &vgic->vgic_redist_base;
                alignment = SZ_64K;
                break;
-#endif
        default:
                r = -ENODEV;
                goto out;
@@ -219,52 +217,65 @@ int kvm_register_vgic_device(unsigned long type)
                ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
                                              KVM_DEV_TYPE_ARM_VGIC_V2);
                break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        case KVM_DEV_TYPE_ARM_VGIC_V3:
                ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
                                              KVM_DEV_TYPE_ARM_VGIC_V3);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
                if (ret)
                        break;
                ret = kvm_vgic_register_its_device();
-               break;
 #endif
+               break;
        }
 
        return ret;
 }
 
-/** vgic_attr_regs_access: allows user space to read/write VGIC registers
- *
- * @dev: kvm device handle
- * @attr: kvm device attribute
- * @reg: address the value is read or written
- * @is_write: write flag
- *
- */
-static int vgic_attr_regs_access(struct kvm_device *dev,
-                                struct kvm_device_attr *attr,
-                                u32 *reg, bool is_write)
-{
+struct vgic_reg_attr {
+       struct kvm_vcpu *vcpu;
        gpa_t addr;
-       int cpuid, ret, c;
-       struct kvm_vcpu *vcpu, *tmp_vcpu;
-       int vcpu_lock_idx = -1;
+};
+
+static int parse_vgic_v2_attr(struct kvm_device *dev,
+                             struct kvm_device_attr *attr,
+                             struct vgic_reg_attr *reg_attr)
+{
+       int cpuid;
 
        cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
                 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-       vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-       addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
 
-       mutex_lock(&dev->kvm->lock);
+       if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
+               return -EINVAL;
 
-       ret = vgic_init(dev->kvm);
-       if (ret)
-               goto out;
+       reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+       reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
 
-       if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
-               ret = -EINVAL;
-               goto out;
+       return 0;
+}
+
+/* unlocks vcpus from @vcpu_lock_idx and smaller */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+       struct kvm_vcpu *tmp_vcpu;
+
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+               mutex_unlock(&tmp_vcpu->mutex);
        }
+}
+
+static void unlock_all_vcpus(struct kvm *kvm)
+{
+       unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+static bool lock_all_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *tmp_vcpu;
+       int c;
 
        /*
         * Any time a vcpu is run, vcpu_load is called which tries to grab the
@@ -272,11 +283,49 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
         * that no other VCPUs are run and fiddle with the vgic state while we
         * access it.
         */
-       ret = -EBUSY;
-       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
-               if (!mutex_trylock(&tmp_vcpu->mutex))
-                       goto out;
-               vcpu_lock_idx = c;
+       kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+               if (!mutex_trylock(&tmp_vcpu->mutex)) {
+                       unlock_vcpus(kvm, c - 1);
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/**
+ * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state
+ *
+ * @dev:      kvm device handle
+ * @attr:     kvm device attribute
+ * @reg:      address the value is read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_attr_regs_access_v2(struct kvm_device *dev,
+                                   struct kvm_device_attr *attr,
+                                   u32 *reg, bool is_write)
+{
+       struct vgic_reg_attr reg_attr;
+       gpa_t addr;
+       struct kvm_vcpu *vcpu;
+       int ret;
+
+       ret = parse_vgic_v2_attr(dev, attr, &reg_attr);
+       if (ret)
+               return ret;
+
+       vcpu = reg_attr.vcpu;
+       addr = reg_attr.addr;
+
+       mutex_lock(&dev->kvm->lock);
+
+       ret = vgic_init(dev->kvm);
+       if (ret)
+               goto out;
+
+       if (!lock_all_vcpus(dev->kvm)) {
+               ret = -EBUSY;
+               goto out;
        }
 
        switch (attr->group) {
@@ -291,18 +340,12 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
                break;
        }
 
+       unlock_all_vcpus(dev->kvm);
 out:
-       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-               tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx);
-               mutex_unlock(&tmp_vcpu->mutex);
-       }
-
        mutex_unlock(&dev->kvm->lock);
        return ret;
 }
 
-/* V2 ops */
-
 static int vgic_v2_set_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
 {
@@ -321,7 +364,7 @@ static int vgic_v2_set_attr(struct kvm_device *dev,
                if (get_user(reg, uaddr))
                        return -EFAULT;
 
-               return vgic_attr_regs_access(dev, attr, &reg, true);
+               return vgic_attr_regs_access_v2(dev, attr, &reg, true);
        }
        }
 
@@ -343,7 +386,7 @@ static int vgic_v2_get_attr(struct kvm_device *dev,
                u32 __user *uaddr = (u32 __user *)(long)attr->addr;
                u32 reg = 0;
 
-               ret = vgic_attr_regs_access(dev, attr, &reg, false);
+               ret = vgic_attr_regs_access_v2(dev, attr, &reg, false);
                if (ret)
                        return ret;
                return put_user(reg, uaddr);
@@ -387,10 +430,6 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = {
        .has_attr = vgic_v2_has_attr,
 };
 
-/* V3 ops */
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-
 static int vgic_v3_set_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
 {
@@ -433,5 +472,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
        .get_attr = vgic_v3_get_attr,
        .has_attr = vgic_v3_has_attr,
 };
-
-#endif /* CONFIG_KVM_ARM_VGIC_V3 */
index 90d8181..0d3c76a 100644
@@ -23,7 +23,7 @@
 #include "vgic-mmio.h"
 
 /* extract @num bytes at @offset bytes offset in data */
-unsigned long extract_bytes(unsigned long data, unsigned int offset,
+unsigned long extract_bytes(u64 data, unsigned int offset,
                            unsigned int num)
 {
        return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
@@ -42,6 +42,7 @@ u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
        return reg | ((u64)val << lower);
 }
 
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
 bool vgic_has_its(struct kvm *kvm)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
@@ -51,6 +52,7 @@ bool vgic_has_its(struct kvm *kvm)
 
        return dist->has_its;
 }
+#endif
 
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
                                            gpa_t addr, unsigned int len)
@@ -179,7 +181,7 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
        int target_vcpu_id = vcpu->vcpu_id;
        u64 value;
 
-       value = (mpidr & GENMASK(23, 0)) << 32;
+       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
        value |= ((target_vcpu_id & 0xffff) << 8);
        if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
                value |= GICR_TYPER_LAST;
@@ -609,7 +611,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
        bool broadcast;
 
        sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-       broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+       broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
        target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
        mpidr = SGI_AFFINITY_LEVEL(reg, 3);
        mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
index 3bad3c5..e18b30d 100644
@@ -550,11 +550,9 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
        case VGIC_V2:
                len = vgic_v2_init_dist_iodev(io_device);
                break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        case VGIC_V3:
                len = vgic_v3_init_dist_iodev(io_device);
                break;
-#endif
        default:
                BUG_ON(1);
        }
index 0b3ecf9..4c34d39 100644
@@ -96,7 +96,7 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
 void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
                                unsigned long data);
 
-unsigned long extract_bytes(unsigned long data, unsigned int offset,
+unsigned long extract_bytes(u64 data, unsigned int offset,
                            unsigned int num);
 
 u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
@@ -162,12 +162,10 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
 
 unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
 
-#ifdef CONFIG_KVM_ARM_VGIC_V3
 u64 vgic_sanitise_outer_cacheability(u64 reg);
 u64 vgic_sanitise_inner_cacheability(u64 reg);
 u64 vgic_sanitise_shareability(u64 reg);
 u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
                        u64 (*sanitise_fn)(u64));
-#endif
 
 #endif
index 0bf6709..0a063af 100644
@@ -278,12 +278,14 @@ int vgic_v2_map_resources(struct kvm *kvm)
                goto out;
        }
 
-       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-                                   kvm_vgic_global_state.vcpu_base,
-                                   KVM_VGIC_V2_CPU_SIZE, true);
-       if (ret) {
-               kvm_err("Unable to remap VGIC CPU to VCPU\n");
-               goto out;
+       if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
+               ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+                                           kvm_vgic_global_state.vcpu_base,
+                                           KVM_VGIC_V2_CPU_SIZE, true);
+               if (ret) {
+                       kvm_err("Unable to remap VGIC CPU to VCPU\n");
+                       goto out;
+               }
        }
 
        dist->ready = true;
@@ -294,6 +296,8 @@ out:
        return ret;
 }
 
+DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
+
 /**
  * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
  * @node:      pointer to the DT node
@@ -310,45 +314,51 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
                return -ENXIO;
        }
 
-       if (!PAGE_ALIGNED(info->vcpu.start)) {
-               kvm_err("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)info->vcpu.start);
-               return -ENXIO;
-       }
+       if (!PAGE_ALIGNED(info->vcpu.start) ||
+           !PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
+               kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start,
+                                                            resource_size(&info->vcpu));
+               if (!kvm_vgic_global_state.vcpu_base_va) {
+                       kvm_err("Cannot ioremap GICV\n");
+                       return -ENOMEM;
+               }
 
-       if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
-               kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-                       (unsigned long long)resource_size(&info->vcpu),
-                       PAGE_SIZE);
-               return -ENXIO;
+               ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va,
+                                            kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu),
+                                            info->vcpu.start);
+               if (ret) {
+                       kvm_err("Cannot map GICV into hyp\n");
+                       goto out;
+               }
+
+               static_branch_enable(&vgic_v2_cpuif_trap);
        }
 
        kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start,
                                                   resource_size(&info->vctrl));
        if (!kvm_vgic_global_state.vctrl_base) {
                kvm_err("Cannot ioremap GICH\n");
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
 
        vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
        kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
 
-       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-       if (ret) {
-               kvm_err("Cannot register GICv2 KVM device\n");
-               iounmap(kvm_vgic_global_state.vctrl_base);
-               return ret;
-       }
-
        ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
                                     kvm_vgic_global_state.vctrl_base +
                                         resource_size(&info->vctrl),
                                     info->vctrl.start);
        if (ret) {
                kvm_err("Cannot map VCTRL into hyp\n");
-               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
-               iounmap(kvm_vgic_global_state.vctrl_base);
-               return ret;
+               goto out;
+       }
+
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+       if (ret) {
+               kvm_err("Cannot register GICv2 KVM device\n");
+               goto out;
        }
 
        kvm_vgic_global_state.can_emulate_gicv2 = true;
@@ -359,4 +369,11 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
        kvm_info("vgic-v2@%llx\n", info->vctrl.start);
 
        return 0;
+out:
+       if (kvm_vgic_global_state.vctrl_base)
+               iounmap(kvm_vgic_global_state.vctrl_base);
+       if (kvm_vgic_global_state.vcpu_base_va)
+               iounmap(kvm_vgic_global_state.vcpu_base_va);
+
+       return ret;
 }
index e83b7fe..2893d5b 100644
@@ -29,7 +29,7 @@
 #define DEBUG_SPINLOCK_BUG_ON(p)
 #endif
 
-struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
+struct vgic_global __section(.hyp.text) kvm_vgic_global_state = {.gicv3_cpuif = STATIC_KEY_FALSE_INIT,};
 
 /*
  * Locking order is always:
@@ -645,6 +645,9 @@ next:
 /* Sync back the hardware VGIC state into our emulation after a guest's run. */
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
        vgic_process_maintenance_interrupt(vcpu);
        vgic_fold_lr_state(vcpu);
        vgic_prune_ap_list(vcpu);
@@ -653,6 +656,9 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 /* Flush our emulation state into the GIC hardware before entering the guest. */
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 {
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
        spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
        vgic_flush_lr_state(vcpu);
        spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
index 6c4625c..9d9e014 100644
@@ -72,7 +72,6 @@ static inline void vgic_get_irq_kref(struct vgic_irq *irq)
        kref_get(&irq->refcount);
 }
 
-#ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -84,63 +83,14 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
 int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
 int vgic_register_its_iodevs(struct kvm *kvm);
 bool vgic_has_its(struct kvm *kvm);
 int kvm_vgic_register_its_device(void);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 #else
-static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu,
-                                      struct vgic_irq *irq, int lr)
-{
-}
-
-static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-}
-
-static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-}
-
-static inline
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-}
-
-static inline void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline int vgic_v3_probe(const struct gic_kvm_info *info)
-{
-       return -ENODEV;
-}
-
-static inline int vgic_v3_map_resources(struct kvm *kvm)
-{
-       return -ENODEV;
-}
-
-static inline int vgic_register_redist_iodevs(struct kvm *kvm,
-                                             gpa_t dist_base_address)
-{
-       return -ENODEV;
-}
-
 static inline int vgic_register_its_iodevs(struct kvm *kvm)
 {
        return -ENODEV;
index e469b60..f397e9b 100644
@@ -42,7 +42,6 @@
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
 
-static struct workqueue_struct *irqfd_cleanup_wq;
 
 static void
 irqfd_inject(struct work_struct *work)
@@ -168,7 +167,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
 
        list_del_init(&irqfd->list);
 
-       queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
+       schedule_work(&irqfd->shutdown);
 }
 
 int __attribute__((weak)) kvm_arch_set_irq_inatomic(
@@ -555,7 +554,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
-       flush_workqueue(irqfd_cleanup_wq);
+       flush_work(&irqfd->shutdown);
 
        return 0;
 }
@@ -592,7 +591,7 @@ kvm_irqfd_release(struct kvm *kvm)
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
-       flush_workqueue(irqfd_cleanup_wq);
+       flush_work(&irqfd->shutdown);
 
 }
 
@@ -622,23 +621,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
        spin_unlock_irq(&kvm->irqfds.lock);
 }
 
-/*
- * create a host-wide workqueue for issuing deferred shutdown requests
- * aggregated from all vm* instances. We need our own isolated single-thread
- * queue to prevent deadlock against flushing the normal work-queue.
- */
-int kvm_irqfd_init(void)
-{
-       irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
-       if (!irqfd_cleanup_wq)
-               return -ENOMEM;
-
-       return 0;
-}
-
 void kvm_irqfd_exit(void)
 {
-       destroy_workqueue(irqfd_cleanup_wq);
 }
 #endif
 
index 1950782..81dfc73 100644
@@ -559,9 +559,11 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 
        debugfs_remove_recursive(kvm->debugfs_dentry);
 
-       for (i = 0; i < kvm_debugfs_num_entries; i++)
-               kfree(kvm->debugfs_stat_data[i]);
-       kfree(kvm->debugfs_stat_data);
+       if (kvm->debugfs_stat_data) {
+               for (i = 0; i < kvm_debugfs_num_entries; i++)
+                       kfree(kvm->debugfs_stat_data[i]);
+               kfree(kvm->debugfs_stat_data);
+       }
 }
 
 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
@@ -2369,6 +2371,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
 
+       debugfs_remove_recursive(vcpu->debugfs_dentry);
        kvm_put_kvm(vcpu->kvm);
        return 0;
 }
@@ -2391,6 +2394,32 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
+static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       char dir_name[ITOA_MAX_LEN * 2];
+       int ret;
+
+       if (!kvm_arch_has_vcpu_debugfs())
+               return 0;
+
+       if (!debugfs_initialized())
+               return 0;
+
+       snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
+       vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
+                                                 vcpu->kvm->debugfs_dentry);
+       if (!vcpu->debugfs_dentry)
+               return -ENOMEM;
+
+       ret = kvm_arch_create_vcpu_debugfs(vcpu);
+       if (ret < 0) {
+               debugfs_remove_recursive(vcpu->debugfs_dentry);
+               return ret;
+       }
+
+       return 0;
+}
+
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
@@ -2423,6 +2452,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_destroy;
 
+       r = kvm_create_vcpu_debugfs(vcpu);
+       if (r)
+               goto vcpu_destroy;
+
        mutex_lock(&kvm->lock);
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
@@ -2454,6 +2487,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 
 unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
+       debugfs_remove_recursive(vcpu->debugfs_dentry);
 vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
 vcpu_decrement:
@@ -3619,7 +3653,7 @@ static int vm_stat_get_per_vm(void *data, u64 *val)
 {
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
 
-       *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+       *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
 
        return 0;
 }
@@ -3649,7 +3683,7 @@ static int vcpu_stat_get_per_vm(void *data, u64 *val)
        *val = 0;
 
        kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-               *val += *(u32 *)((void *)vcpu + stat_data->offset);
+               *val += *(u64 *)((void *)vcpu + stat_data->offset);
 
        return 0;
 }
@@ -3807,12 +3841,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
         * kvm_arch_init makes sure there's at most one caller
         * for architectures that support multiple implementations,
         * like intel and amd on x86.
-        * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
-        * conflicts in case kvm is already setup for another implementation.
         */
-       r = kvm_irqfd_init();
-       if (r)
-               goto out_irqfd;
 
        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
@@ -3894,7 +3923,6 @@ out_free_0a:
        free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
        kvm_irqfd_exit();
-out_irqfd:
        kvm_arch_exit();
 out_fail:
        return r;