- HYPERV_CPUID_ENLIGHTMENT_INFO
- HYPERV_CPUID_IMPLEMENT_LIMITS
- HYPERV_CPUID_NESTED_FEATURES
+ - HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS
+ - HYPERV_CPUID_SYNDBG_INTERFACE
+ - HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
-Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure
with the 'nent' field indicating the number of entries in the variable-size
array 'entries'. If the number of entries is too low to describe all Hyper-V
feature leaves, an error (E2BIG) is returned. If the number is greater than or equal
Verify the integrity of the unpacked image. Only if this succeeds,
KVM is allowed to start protected VCPUs.
+4.126 KVM_X86_SET_MSR_FILTER
+----------------------------
+
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
+
+::
+
+ struct kvm_msr_filter_range {
+ #define KVM_MSR_FILTER_READ (1 << 0)
+ #define KVM_MSR_FILTER_WRITE (1 << 1)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+ };
+
+ #define KVM_MSR_FILTER_MAX_RANGES 16
+ struct kvm_msr_filter {
+ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+ #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+ };
+
+flags values for struct kvm_msr_filter_range:
+
+KVM_MSR_FILTER_READ
+
+ Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a read should immediately fail, while a 1 indicates that
+ a read for a particular MSR should be handled regardless of the default
+ filter action.
+
+KVM_MSR_FILTER_WRITE
+
+ Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a write should immediately fail, while a 1 indicates that
+ a write for a particular MSR should be handled regardless of the default
+ filter action.
+
+KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE
+
+ Filter both read and write accesses to MSRs using the given bitmap. A 0
+ in the bitmap indicates that both reads and writes should immediately fail,
+ while a 1 indicates that reads and writes for a particular MSR are not
+ filtered by this range.
+
+flags values for struct kvm_msr_filter:
+
+KVM_MSR_FILTER_DEFAULT_ALLOW
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to allowing access to the MSR.
+
+KVM_MSR_FILTER_DEFAULT_DENY
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to rejecting access to the MSR. In this mode, all MSRs that should
+ be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 ranges of MSRs, each with a
+bitmap specifying whether accesses to the MSRs in that range are explicitly
+allowed or denied.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+old KVM in-kernel emulation behavior is fully preserved.
+
+As soon as filtering is in place, every MSR access is processed through it.
+If an MSR index falls within one of the defined ranges, read and write
+accesses are guarded by the bitmap's value for that index. If the index is
+not covered by any range, whether the access is rejected is determined by the
+flags field in the kvm_msr_filter struct: KVM_MSR_FILTER_DEFAULT_ALLOW or
+KVM_MSR_FILTER_DEFAULT_DENY.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range covers MSR indexes [base .. base+nmsrs-1]. The flags field
+indicates whether reads, writes or both reads and writes are filtered; a 1 bit
+in the bitmap for the corresponding MSR index allows the access, a 0 denies it.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, such
+MSR accesses can instead be deflected into user space and potentially
+handled there.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
+
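+As a rough illustration (a sketch, not part of the ABI), user space could deny
+guest writes to a single MSR while keeping the default-allow policy as follows.
+The MSR index 0x123 and the vm_fd file descriptor are assumptions made for the
+example only::
+
+  /* Deny writes to one hypothetical MSR (0x123), allow everything else. */
+  __u8 deny_write = 0x00;  /* bit 0 == 0: writes to 'base + 0' are denied */
+  struct kvm_msr_filter filter = {
+          .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+          .ranges = {
+                  {
+                          .flags  = KVM_MSR_FILTER_WRITE,
+                          .nmsrs  = 1,
+                          .base   = 0x123,        /* hypothetical MSR index */
+                          .bitmap = &deny_write,
+                  },
+          },
+  };
+
+  if (ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter) < 0)
+          perror("KVM_X86_SET_MSR_FILTER");
+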
5. The kvm_run structure
========================
.. note::
- For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and
- KVM_EXIT_EPR the corresponding
-
-operations are complete (and guest state is consistent) only after userspace
-has re-entered the kernel with KVM_RUN. The kernel side will first finish
-incomplete operations and then check for pending signals. Userspace
-can re-enter the guest with an unmasked signal pending to complete
-pending operations.
+ For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR,
+ KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
+ operations are complete (and guest state is consistent) only after userspace
+ has re-entered the kernel with KVM_RUN. The kernel side will first finish
+ incomplete operations and then check for pending signals. Userspace
+ can re-enter the guest with an unmasked signal pending to complete
+ pending operations.
::
KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
if it decides to decode and emulate the instruction.
+::
+
+ /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+ struct {
+ __u8 error; /* user -> kernel */
+ __u8 pad[7];
+ __u32 reason; /* kernel -> user */
+ __u32 index; /* kernel -> user */
+ __u64 data; /* kernel <-> user */
+ } msr;
+
+Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is
+enabled, MSR accesses that would otherwise cause KVM to inject a #GP into the
+guest will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and a
+KVM_EXIT_X86_WRMSR exit for writes.
+
+The "reason" field specifies why the MSR trap occurred. User space will only
+receive MSR exit traps when a particular reason was requested during through
+ENABLE_CAP. Currently valid exit reasons are:
+
+ KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM
+ KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits
+ KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER
+
+For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest
+wants to read. To respond to this request with a successful read, user space
+writes the respective data into the "data" field and must continue guest
+execution to ensure the read data is transferred into guest register state.
+
+If the RDMSR request was unsuccessful, user space indicates that with a "1" in
+the "error" field. This will inject a #GP into the guest when the VCPU is
+executed again.
+
+For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest
+wants to write. Once finished processing the event, user space must continue
+vCPU execution. If the MSR write was unsuccessful, user space also sets the
+"error" field to "1".
+
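+A minimal sketch of how a user space VMM's run loop might handle these exits
+is shown below. The "run" pointer and the re-entry via KVM_RUN are the usual
+KVM run loop conventions; the handled MSR index 0x123 is purely illustrative::
+
+  switch (run->exit_reason) {
+  case KVM_EXIT_X86_RDMSR:
+          if (run->msr.index == 0x123) {     /* hypothetical MSR */
+                  run->msr.data = 0;         /* value returned to the guest */
+                  run->msr.error = 0;
+          } else {
+                  run->msr.error = 1;        /* inject #GP on next KVM_RUN */
+          }
+          break;
+  case KVM_EXIT_X86_WRMSR:
+          /* Accept writes to the known MSR, reject everything else. */
+          run->msr.error = (run->msr.index == 0x123) ? 0 : 1;
+          break;
+  }
+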
::
/* Fix the size of the union. */
the maximum halt time to be specified on a per-VM basis, effectively overriding
the module parameter for the target VM.
+7.21 KVM_CAP_X86_USER_SPACE_MSR
+-------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report
+:Returns: 0 on success; -1 on error
+
+This capability enables trapping of RDMSR and WRMSR instructions that would
+otherwise trigger a #GP, so that they can be handled in user space.
+
+When a guest requests to read or write an MSR, KVM may not implement all MSRs
+that are relevant to the system at hand. It also does not differentiate by
+CPU type.
+
+To allow more fine-grained control over MSR handling, user space may enable
+this capability. With it enabled, MSR accesses that match the mask specified in
+args[0] and that would cause KVM to inject a #GP into the guest will instead
+trigger KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. User
+space can then handle these to implement model-specific MSR handling and/or
+to inform a user that an MSR was not handled.
+
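+A possible way to enable this from user space (a sketch; vm_fd is assumed to
+be an open VM file descriptor) is::
+
+  struct kvm_enable_cap cap = {
+          .cap = KVM_CAP_X86_USER_SPACE_MSR,
+          .args[0] = KVM_MSR_EXIT_REASON_UNKNOWN | KVM_MSR_EXIT_REASON_FILTER,
+  };
+
+  ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+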
8. Other capabilities.
======================
is supported, then the other should be as well and vice versa. For arm64
see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL".
For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME".
+
+8.25 KVM_CAP_S390_DIAG318
+-------------------------
+
+:Architectures: s390
+
+This capability enables a guest to set information about its control program
+(i.e. guest kernel type and version). The information is helpful during
+system/firmware service events, providing additional data about the guest
+environments running on the machine.
+
+The information is associated with the DIAGNOSE 0x318 instruction, which sets
+an 8-byte value consisting of a one-byte Control Program Name Code (CPNC) and
+a 7-byte Control Program Version Code (CPVC). The CPNC determines what
+environment the control program is running in (e.g. Linux, z/VM...), and the
+CPVC is used for OS-specific information (e.g. Linux version, Linux
+distribution...).
+
+If this capability is available, then the CPNC and CPVC can be synchronized
+between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318).
+
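+As a sketch of the layout described above (assuming, for illustration, that
+the CPNC occupies the most significant byte of the doubleword and the CPVC the
+remaining seven bytes; cpnc and cpvc below are placeholder values), the 8-byte
+value could be assembled as::
+
+  __u8  cpnc = 0;    /* placeholder Control Program Name Code */
+  __u64 cpvc = 0;    /* placeholder Control Program Version Code (7 bytes) */
+  __u64 diag318_value = ((__u64)cpnc << 56) | (cpvc & 0x00ffffffffffffffULL);
+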
+8.26 KVM_CAP_X86_USER_SPACE_MSR
+-------------------------------
+
+:Architectures: x86
+
+This capability indicates that KVM supports deflection of MSR reads and
+writes to user space. It can be enabled on a VM level. If enabled, MSR
+accesses that would usually trigger a #GP by KVM into the guest will
+instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
+KVM_EXIT_X86_WRMSR exit notifications.
+
+8.27 KVM_X86_SET_MSR_FILTER
+---------------------------
+
+:Architectures: x86
+
+This capability indicates that KVM can reject accesses to user-defined ranges
+of MSRs. With this capability exposed, KVM exports the new VM ioctl
+KVM_X86_SET_MSR_FILTER, which user space can call to specify bitmaps of MSR
+ranges that KVM should deny access to.
+
+In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
+trap and emulate MSRs that are outside of the scope of KVM as well as
+limit the attack surface on KVM's MSR emulation code.
return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
}
-static __always_inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
+static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu)
{
return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW);
}
+/* Always check for S1PTW *before* using this. */
static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
{
- return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR) ||
- kvm_vcpu_dabt_iss1tw(vcpu); /* AF/DBM update */
+ return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR;
}
static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW;
}
+static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
+}
+
static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
{
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
+ if (kvm_vcpu_abt_iss1tw(vcpu))
+ return true;
+
if (kvm_vcpu_trap_is_iabt(vcpu))
return false;
kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
kvm_vcpu_dabt_isvalid(vcpu) &&
!kvm_vcpu_abt_issea(vcpu) &&
- !kvm_vcpu_dabt_iss1tw(vcpu);
+ !kvm_vcpu_abt_iss1tw(vcpu);
if (valid) {
int ret = __vgic_v2_perform_cpuif_access(vcpu);
struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
goto out;
}
- if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
ret = 1;
goto out_unlock;
#define KVM_MIPS_GUEST_TLB_SIZE 64
struct kvm_vcpu_arch {
void *guest_ebase;
- int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ int (*vcpu_run)(struct kvm_vcpu *vcpu);
/* Host registers preserved across guest mode execution */
unsigned long host_stack;
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
-extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu);
/* Building of entry/exception code */
int kvm_mips_entry_setup(void);
* Assemble the start of the vcpu_run function to run a guest VCPU. The function
* conforms to the following prototype:
*
- * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ * int vcpu_run(struct kvm_vcpu *vcpu);
*
* The exit from the guest and return to the caller is handled by the code
* generated by kvm_mips_build_ret_to_host().
unsigned int i;
/*
- * A0: run
- * A1: vcpu
+ * A0: vcpu
*/
/* k0/k1 not being used in host kernel context */
kvm_mips_build_save_scratch(&p, V1, K1);
/* VCPU scratch register has pointer to vcpu */
- UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+ UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]);
/* Offset into vcpu->arch */
- UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+ UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch));
/*
* Save the host stack to VCPU, used for exception processing
/* Now that context has been saved, we can use other registers */
/* Restore vcpu */
- UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
-
- /* Restore run (vcpu->run) */
- UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1);
+ UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
/*
* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
* with this in the kernel
*/
uasm_i_move(&p, A0, S0);
- uasm_i_move(&p, A1, S1);
UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
uasm_i_jalr(&p, RA, T9);
UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
* guest, reload k1
*/
- uasm_i_move(&p, K1, S1);
+ uasm_i_move(&p, K1, S0);
UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
/*
{
u32 *p = addr;
- /* Put the saved pointer to vcpu (s1) back into the scratch register */
- UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+ /* Put the saved pointer to vcpu (s0) back into the scratch register */
+ UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
/* Load up the Guest EBASE to minimize the window where BEV is set */
UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
switch (type) {
+ case KVM_VM_MIPS_AUTO:
+ break;
#ifdef CONFIG_KVM_MIPS_VZ
case KVM_VM_MIPS_VZ:
#else
/*
* Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
*/
-int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
u32 cause = vcpu->arch.host_cp0_cause;
u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
u32 __user *opc = (u32 __user *) vcpu->arch.pc;
*/
kvm_mips_suspend_mm(cpu);
- r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
+ r = vcpu->arch.vcpu_run(vcpu);
/* We may have migrated while handling guest exits */
cpu = smp_processor_id();
kvm_vz_vcpu_load_tlb(vcpu, cpu);
kvm_vz_vcpu_load_wired(vcpu);
- r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
+ r = vcpu->arch.vcpu_run(vcpu);
kvm_vz_vcpu_save_wired(vcpu);
#endif
#ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
+ struct kvmppc_xics *xics_device;
struct kvmppc_xive *xive; /* Current XIVE device in use */
struct {
struct kvmppc_xive *native;
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
#ifdef CONFIG_KVM_XICS
/*
- * Free the XIVE devices which are not directly freed by the
+ * Free the XIVE and XICS devices which are not directly freed by the
* device 'release' method
*/
kfree(kvm->arch.xive_devices.native);
kvm->arch.xive_devices.native = NULL;
kfree(kvm->arch.xive_devices.xics_on_xive);
kvm->arch.xive_devices.xics_on_xive = NULL;
+ kfree(kvm->arch.xics_device);
+ kvm->arch.xics_device = NULL;
#endif /* CONFIG_KVM_XICS */
}
return __radix_pte_update(ptep, clr, set);
}
-void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
struct kvmppc_spapr_tce_table *siter;
struct mm_struct *mm = kvm->mm;
unsigned long npages, size = args->size;
- int ret = -ENOMEM;
+ int ret;
if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
return -EINVAL;
}
-long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
+static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
unsigned long entry, unsigned long ua,
enum dma_data_direction dir)
{
return ret;
}
-extern void iommu_tce_kill_rm(struct iommu_table *tbl,
+static void iommu_tce_kill_rm(struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
{
if (tbl->it_ops->tce_kill)
unsigned long host_psscr = mfspr(SPRN_PSSCR);
unsigned long host_pidr = mfspr(SPRN_PID);
+ /*
+ * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
+ * so set HDICE before writing HDEC.
+ */
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
+ isync();
+
hdec = time_limit - mftb();
- if (hdec < 0)
+ if (hdec < 0) {
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+ isync();
return BOOK3S_INTERRUPT_HV_DECREMENTER;
+ }
mtspr(SPRN_HDEC, hdec);
if (vc->tb_offset) {
* Virtual-mode guest entry for POWER9 and later when the host and
* guest are both using the radix MMU. The LPIDR has already been set.
*/
-int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
dec = mfspr(SPRN_DEC);
tb = mftb();
- if (dec < 512)
+ if (dec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
local_paca->kvm_hstate.dec_expires = dec + tb;
if (local_paca->kvm_hstate.dec_expires < time_limit)
case KVM_PPC_ALLOCATE_HTAB: {
u32 htab_order;
+ /* If we're a nested hypervisor, we currently only support radix */
+ if (kvmhv_on_pseries()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+
r = -EFAULT;
if (get_user(htab_order, (u32 __user *)argp))
break;
/*
* Put whatever is in the decrementer into the
* hypervisor decrementer.
+ * Because of a hardware deviation in P8 and P9,
+ * we need to set LPCR[HDICE] before writing HDEC.
*/
-BEGIN_FTR_SECTION
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_KVM(r5)
ld r9, KVM_HOST_LPCR(r6)
- andis. r9, r9, LPCR_LD@h
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ ori r8, r9, LPCR_HDICE
+ mtspr SPRN_LPCR, r8
+ isync
+ andis. r0, r9, LPCR_LD@h
mfspr r8,SPRN_DEC
mftb r7
BEGIN_FTR_SECTION
kvmhv_set_nested_ptbl(gp);
}
-struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
{
struct kvm_nested_guest *gp;
long shadow_lpid;
return ics_rm_eoi(vcpu, irq);
}
-unsigned long eoi_rc;
+static unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
{
#endif
}
-void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
{
u32 host_pvr;
return -ENXIO;
}
-static void kvmppc_xics_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xics_release(struct kvm_device *dev)
{
struct kvmppc_xics *xics = dev->private;
int i;
struct kvm *kvm = xics->kvm;
+ struct kvm_vcpu *vcpu;
+
+ pr_devel("Releasing xics device\n");
+
+ /*
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd referring to the
+ * device. Therefore there can not be any of the device
+ * attribute set/get functions being executed concurrently,
+ * and similarly, the connect_vcpu and set/clr_mapped
+ * functions also cannot be being executed.
+ */
debugfs_remove(xics->dentry);
+ /*
+ * We should clean up the vCPU interrupt presenters first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
+ * Holding the vcpu->mutex also means that execution is
+ * excluded for the vcpu until the ICP was freed. When the vcpu
+ * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
+ * have been cleared and the vcpu will not be going into the
+ * XICS code anymore.
+ */
+ mutex_lock(&vcpu->mutex);
+ kvmppc_xics_free_icp(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
+
if (kvm)
kvm->arch.xics = NULL;
- for (i = 0; i <= xics->max_icsid; i++)
+ for (i = 0; i <= xics->max_icsid; i++) {
kfree(xics->ics[i]);
- kfree(xics);
+ xics->ics[i] = NULL;
+ }
+ /*
+ * A reference of the kvmppc_xics pointer is now kept under
+ * the xics_device pointer of the machine for reuse. It is
+ * freed when the VM is destroyed for now until we fix all the
+ * execution paths.
+ */
kfree(dev);
}
+static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
+{
+ struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
+ struct kvmppc_xics *xics = *kvm_xics_device;
+
+ if (!xics) {
+ xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+ *kvm_xics_device = xics;
+ } else {
+ memset(xics, 0, sizeof(*xics));
+ }
+
+ return xics;
+}
+
static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xics *xics;
struct kvm *kvm = dev->kvm;
- int ret = 0;
- xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+ pr_devel("Creating xics for partition\n");
+
+ /* Already there ? */
+ if (kvm->arch.xics)
+ return -EEXIST;
+
+ xics = kvmppc_xics_get_device(kvm);
if (!xics)
return -ENOMEM;
dev->private = xics;
xics->dev = dev;
xics->kvm = kvm;
-
- /* Already there ? */
- if (kvm->arch.xics)
- ret = -EEXIST;
- else
- kvm->arch.xics = xics;
-
- if (ret) {
- kfree(xics);
- return ret;
- }
+ kvm->arch.xics = xics;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (cpu_has_feature(CPU_FTR_ARCH_206) &&
.name = "kvm-xics",
.create = kvmppc_xics_create,
.init = kvmppc_xics_init,
- .destroy = kvmppc_xics_free,
+ .release = kvmppc_xics_release,
.set_attr = xics_set_attr,
.get_attr = xics_get_attr,
.has_attr = xics_has_attr,
return -EPERM;
if (xics->kvm != vcpu->kvm)
return -EPERM;
- if (vcpu->arch.irq_type)
+ if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
r = kvmppc_xics_create_icp(vcpu, xcpu);
return 0;
}
-static int xive_native_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xive_native_debug_show, inode->i_private);
-}
-
-static const struct file_operations xive_native_debug_fops = {
- .open = xive_native_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
#define STATIC
#include <linux/decompress/mm.h>
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift __ro_after_init = 39;
static bool memmap_too_large;
-/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
-static unsigned long long mem_limit = ULLONG_MAX;
+/*
+ * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
+ * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
+ */
+static u64 mem_limit;
/* Number of immovable memory regions */
static int num_immovable_mem;
};
static int
-parse_memmap(char *p, unsigned long long *start, unsigned long long *size,
- enum parse_mode mode)
+parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
{
char *oldp;
*/
*size = 0;
} else {
- unsigned long long flags;
+ u64 flags;
/*
* efi_fake_mem=nn@ss:attr the attr specifies
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
- unsigned long long start, size;
+ u64 start, size;
char *k = strchr(str, ',');
if (k)
if (start == 0) {
/* Store the specified memory limit if size > 0 */
- if (size > 0)
+ if (size > 0 && size < mem_limit)
mem_limit = size;
continue;
static void handle_mem_options(void)
{
char *args = (char *)get_cmd_line_ptr();
- size_t len = strlen((char *)args);
+ size_t len;
char *tmp_cmdline;
char *param, *val;
u64 mem_size;
- if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
- !strstr(args, "hugepages"))
+ if (!args)
return;
+ len = strnlen(args, COMMAND_LINE_SIZE-1);
tmp_cmdline = malloc(len + 1);
if (!tmp_cmdline)
error("Failed to allocate space for tmp_cmdline");
while (*args) {
args = next_arg(args, &param, &val);
/* Stop at -- */
- if (!val && strcmp(param, "--") == 0) {
- warn("Only '--' specified in cmdline");
- goto out;
- }
+ if (!val && strcmp(param, "--") == 0)
+ break;
if (!strcmp(param, "memmap")) {
mem_avoid_memmap(PARSE_MEMMAP, val);
- } else if (strstr(param, "hugepages")) {
+ } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
parse_gb_huge_pages(param, val);
} else if (!strcmp(param, "mem")) {
char *p = val;
continue;
mem_size = memparse(p, &p);
if (mem_size == 0)
- goto out;
+ break;
- mem_limit = mem_size;
+ if (mem_size < mem_limit)
+ mem_limit = mem_size;
} else if (!strcmp(param, "efi_fake_mem")) {
mem_avoid_memmap(PARSE_EFI, val);
}
}
-out:
free(tmp_cmdline);
return;
}
/*
- * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
+ * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
+ *
* The mem_avoid array is used to store the ranges that need to be avoided
* when KASLR searches for an appropriate random address. We must avoid any
* regions that are unsafe to overlap with during decompression, and other
{
unsigned long init_size = boot_params->hdr.init_size;
u64 initrd_start, initrd_size;
- u64 cmd_line, cmd_line_size;
- char *ptr;
+ unsigned long cmd_line, cmd_line_size;
/*
* Avoid the region that is unsafe to overlap during
/* No need to set mapping for initrd, it will be handled in VO. */
/* Avoid kernel command line. */
- cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
- cmd_line |= boot_params->hdr.cmd_line_ptr;
+ cmd_line = get_cmd_line_ptr();
/* Calculate size of cmd_line. */
- ptr = (char *)(unsigned long)cmd_line;
- for (cmd_line_size = 0; ptr[cmd_line_size++];)
- ;
- mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
- mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
- add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
- mem_avoid[MEM_AVOID_CMDLINE].size);
+ if (cmd_line) {
+ cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
+ mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+ mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+ add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
+ mem_avoid[MEM_AVOID_CMDLINE].size);
+ }
/* Avoid boot parameters. */
mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
{
int i;
struct setup_data *ptr;
- unsigned long earliest = img->start + img->size;
+ u64 earliest = img->start + img->size;
bool is_overlapping = false;
for (i = 0; i < MEM_AVOID_MAX; i++) {
}
struct slot_area {
- unsigned long addr;
- int num;
+ u64 addr;
+ unsigned long num;
};
#define MAX_SLOT_AREA 100
static struct slot_area slot_areas[MAX_SLOT_AREA];
-
+static unsigned int slot_area_index;
static unsigned long slot_max;
-static unsigned long slot_area_index;
-
static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
struct slot_area slot_area;
return;
slot_area.addr = region->start;
- slot_area.num = (region->size - image_size) /
- CONFIG_PHYSICAL_ALIGN + 1;
+ slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;
- if (slot_area.num > 0) {
- slot_areas[slot_area_index++] = slot_area;
- slot_max += slot_area.num;
- }
+ slot_areas[slot_area_index++] = slot_area;
+ slot_max += slot_area.num;
}
/*
static void
process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
{
- unsigned long addr, size = 0;
+ u64 pud_start, pud_end;
+ unsigned long gb_huge_pages;
struct mem_vector tmp;
- int i = 0;
- if (!max_gb_huge_pages) {
+ if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
store_slot_info(region, image_size);
return;
}
- addr = ALIGN(region->start, PUD_SIZE);
- /* Did we raise the address above the passed in memory entry? */
- if (addr < region->start + region->size)
- size = region->size - (addr - region->start);
-
- /* Check how many 1GB huge pages can be filtered out: */
- while (size > PUD_SIZE && max_gb_huge_pages) {
- size -= PUD_SIZE;
- max_gb_huge_pages--;
- i++;
- }
+ /* Are there any 1GB pages in the region? */
+ pud_start = ALIGN(region->start, PUD_SIZE);
+ pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
/* No good 1GB huge pages found: */
- if (!i) {
+ if (pud_start >= pud_end) {
store_slot_info(region, image_size);
return;
}
- /*
- * Skip those 'i'*1GB good huge pages, and continue checking and
- * processing the remaining head or tail part of the passed region
- * if available.
- */
-
- if (addr >= region->start + image_size) {
+ /* Check if the head part of the region is usable. */
+ if (pud_start >= region->start + image_size) {
tmp.start = region->start;
- tmp.size = addr - region->start;
+ tmp.size = pud_start - region->start;
store_slot_info(&tmp, image_size);
}
- size = region->size - (addr - region->start) - i * PUD_SIZE;
- if (size >= image_size) {
- tmp.start = addr + i * PUD_SIZE;
- tmp.size = size;
+ /* Skip the good 1GB pages. */
+ gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
+ if (gb_huge_pages > max_gb_huge_pages) {
+ pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
+ max_gb_huge_pages = 0;
+ } else {
+ max_gb_huge_pages -= gb_huge_pages;
+ }
+
+ /* Check if the tail part of the region is usable. */
+ if (region->start + region->size >= pud_end + image_size) {
+ tmp.start = pud_end;
+ tmp.size = region->start + region->size - pud_end;
store_slot_info(&tmp, image_size);
}
}
-static unsigned long slots_fetch_random(void)
+static u64 slots_fetch_random(void)
{
unsigned long slot;
- int i;
+ unsigned int i;
/* Handle case of no slots stored. */
if (slot_max == 0)
slot -= slot_areas[i].num;
continue;
}
- return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
+ return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
}
if (i == slot_area_index)
unsigned long image_size)
{
struct mem_vector region, overlap;
- unsigned long start_orig, end;
- struct mem_vector cur_entry;
-
- /* On 32-bit, ignore entries entirely above our maximum. */
- if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
- return;
+ u64 region_end;
- /* Ignore entries entirely below our minimum. */
- if (entry->start + entry->size < minimum)
- return;
-
- /* Ignore entries above memory limit */
- end = min(entry->size + entry->start, mem_limit);
- if (entry->start >= end)
- return;
- cur_entry.start = entry->start;
- cur_entry.size = end - entry->start;
-
- region.start = cur_entry.start;
- region.size = cur_entry.size;
+ /* Enforce minimum and memory limit. */
+ region.start = max_t(u64, entry->start, minimum);
+ region_end = min(entry->start + entry->size, mem_limit);
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
- start_orig = region.start;
-
- /* Potentially raise address to minimum location. */
- if (region.start < minimum)
- region.start = minimum;
-
/* Potentially raise address to meet alignment needs. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above the passed in memory entry? */
- if (region.start > cur_entry.start + cur_entry.size)
+ if (region.start > region_end)
return;
/* Reduce size by any delta from the original address. */
- region.size -= region.start - start_orig;
-
- /* On 32-bit, reduce region size to fit within max size. */
- if (IS_ENABLED(CONFIG_X86_32) &&
- region.start + region.size > KERNEL_IMAGE_SIZE)
- region.size = KERNEL_IMAGE_SIZE - region.start;
+ region.size = region_end - region.start;
/* Return if region can't contain decompressed kernel */
if (region.size < image_size)
}
/* Store beginning of region if holds at least image_size. */
- if (overlap.start > region.start + image_size) {
- struct mem_vector beginning;
-
- beginning.start = region.start;
- beginning.size = overlap.start - region.start;
- process_gb_huge_pages(&beginning, image_size);
+ if (overlap.start >= region.start + image_size) {
+ region.size = overlap.start - region.start;
+ process_gb_huge_pages(&region, image_size);
}
- /* Return if overlap extends to or past end of region. */
- if (overlap.start + overlap.size >= region.start + region.size)
- return;
-
/* Clip off the overlapping region and start over. */
- region.size -= overlap.start - region.start + overlap.size;
region.start = overlap.start + overlap.size;
}
}
static bool process_mem_region(struct mem_vector *region,
- unsigned long long minimum,
- unsigned long long image_size)
+ unsigned long minimum,
+ unsigned long image_size)
{
int i;
/*
* immovable memory and @region.
*/
for (i = 0; i < num_immovable_mem; i++) {
- unsigned long long start, end, entry_end, region_end;
+ u64 start, end, entry_end, region_end;
struct mem_vector entry;
if (!mem_overlaps(region, &immovable_mem[i]))
#ifdef CONFIG_EFI
/*
- * Returns true if mirror region found (and must have been processed
- * for slots adding)
+ * Returns true if we processed the EFI memmap, which we prefer over the E820
+ * table if it is available.
*/
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
+ u64 phys_addr;
+
+ /* Bail out early if it's impossible to succeed. */
+ if (minimum + image_size > mem_limit)
+ return 0;
+
/* Check if we had too many memmaps. */
if (memmap_too_large) {
debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
return 0;
}
- /* Make sure minimum is aligned. */
- minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+ if (!process_efi_entries(minimum, image_size))
+ process_e820_entries(minimum, image_size);
- if (process_efi_entries(minimum, image_size))
- return slots_fetch_random();
+ phys_addr = slots_fetch_random();
- process_e820_entries(minimum, image_size);
- return slots_fetch_random();
+ /* Perform a final check to make sure the address is in range. */
+ if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
+ warn("Invalid physical address chosen!\n");
+ return 0;
+ }
+
+ return (unsigned long)phys_addr;
}
static unsigned long find_random_virt_addr(unsigned long minimum,
{
unsigned long slots, random_addr;
- /* Make sure minimum is aligned. */
- minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
- /* Align image_size for easy slot calculations. */
- image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
-
/*
* There are how many CONFIG_PHYSICAL_ALIGN-sized slots
* that can hold image_size within the range of minimum to
* KERNEL_IMAGE_SIZE?
*/
- slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
- CONFIG_PHYSICAL_ALIGN + 1;
+ slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
random_addr = kaslr_get_random_long("Virtual") % slots;
/* Prepare to add new identity pagetables on demand. */
initialize_identity_maps();
+ if (IS_ENABLED(CONFIG_X86_32))
+ mem_limit = KERNEL_IMAGE_SIZE;
+ else
+ mem_limit = MAXMEM;
+
/* Record the various known unsafe memory ranges. */
mem_avoid_init(input, input_size, *output);
* location:
*/
min_addr = min(*output, 512UL << 20);
+ /* Make sure minimum is aligned. */
+ min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
/* Walk available memory entries to find a random address. */
random_addr = find_random_phys_addr(min_addr, output_size);
int cmdline_find_option_bool(const char *option);
struct mem_vector {
- unsigned long long start;
- unsigned long long size;
+ u64 start;
+ u64 size;
};
#if CONFIG_RANDOMIZE_BASE
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
{
u32 eax, edx;
- asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
- : "=a" (eax), "=d" (edx)
- : "c" (index));
+ asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
return eax + ((u64)edx << 32);
}
u32 eax = value;
u32 edx = value >> 32;
- asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
- : : "a" (eax), "d" (edx), "c" (index));
+ asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
-#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
+#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24)
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
+#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
struct kvm_hv_syndbg hv_syndbg;
};
+struct msr_bitmap_range {
+ u32 flags;
+ u32 nmsrs;
+ u32 base;
+ unsigned long *bitmap;
+};
+
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
+ /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
+ u32 user_space_msr_mask;
+
+ struct {
+ u8 count;
+ bool default_allow:1;
+ struct msr_bitmap_range ranges[16];
+ } msr_filter;
+
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
};
/* Returns actual tsc_offset set in active VMCS */
u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+ /*
+ * Retrieve somewhat arbitrary exit information. Intended to be used
+ * only from within tracepoints to avoid VMREADs when tracing is off.
+ */
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *exit_int_info, u32 *exit_int_info_err_code);
int (*check_intercept)(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
+ bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
+ void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
int (*set_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
struct kvm_nested_state *kvm_state);
- bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+ bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit);
-void kvm_define_shared_msr(unsigned index, u32 msr);
-int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+void kvm_define_user_return_msr(unsigned index, u32 msr);
+int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
#define nop() asm volatile ("nop")
+static inline void serialize(void)
+{
+ /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
+ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
+}
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_SPECIAL_INSNS_H */
#define __SVM_H
#include <uapi/asm/svm.h>
-
+#include <uapi/asm/kvm.h>
+
+/*
+ * 32-bit intercept words in the VMCB Control Area, starting
+ * at Byte offset 000h.
+ */
+
+enum intercept_words {
+ INTERCEPT_CR = 0,
+ INTERCEPT_DR,
+ INTERCEPT_EXCEPTION,
+ INTERCEPT_WORD3,
+ INTERCEPT_WORD4,
+ INTERCEPT_WORD5,
+ MAX_INTERCEPT,
+};
enum {
- INTERCEPT_INTR,
+ /* Byte offset 000h (word 0) */
+ INTERCEPT_CR0_READ = 0,
+ INTERCEPT_CR3_READ = 3,
+ INTERCEPT_CR4_READ = 4,
+ INTERCEPT_CR8_READ = 8,
+ INTERCEPT_CR0_WRITE = 16,
+ INTERCEPT_CR3_WRITE = 16 + 3,
+ INTERCEPT_CR4_WRITE = 16 + 4,
+ INTERCEPT_CR8_WRITE = 16 + 8,
+ /* Byte offset 004h (word 1) */
+ INTERCEPT_DR0_READ = 32,
+ INTERCEPT_DR1_READ,
+ INTERCEPT_DR2_READ,
+ INTERCEPT_DR3_READ,
+ INTERCEPT_DR4_READ,
+ INTERCEPT_DR5_READ,
+ INTERCEPT_DR6_READ,
+ INTERCEPT_DR7_READ,
+ INTERCEPT_DR0_WRITE = 48,
+ INTERCEPT_DR1_WRITE,
+ INTERCEPT_DR2_WRITE,
+ INTERCEPT_DR3_WRITE,
+ INTERCEPT_DR4_WRITE,
+ INTERCEPT_DR5_WRITE,
+ INTERCEPT_DR6_WRITE,
+ INTERCEPT_DR7_WRITE,
+ /* Byte offset 008h (word 2) */
+ INTERCEPT_EXCEPTION_OFFSET = 64,
+ /* Byte offset 00Ch (word 3) */
+ INTERCEPT_INTR = 96,
INTERCEPT_NMI,
INTERCEPT_SMI,
INTERCEPT_INIT,
INTERCEPT_TASK_SWITCH,
INTERCEPT_FERR_FREEZE,
INTERCEPT_SHUTDOWN,
- INTERCEPT_VMRUN,
+ /* Byte offset 010h (word 4) */
+ INTERCEPT_VMRUN = 128,
INTERCEPT_VMMCALL,
INTERCEPT_VMLOAD,
INTERCEPT_VMSAVE,
INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV,
INTERCEPT_RDPRU,
+ /* Byte offset 014h (word 5) */
+ INTERCEPT_INVLPGB = 160,
+ INTERCEPT_INVLPGB_ILLEGAL,
+ INTERCEPT_INVPCID,
+ INTERCEPT_MCOMMIT,
+ INTERCEPT_TLBSYNC,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u32 intercept_cr;
- u32 intercept_dr;
- u32 intercept_exceptions;
- u64 intercept;
- u8 reserved_1[40];
+ u32 intercepts[MAX_INTERCEPT];
+ u32 reserved_1[15 - MAX_INTERCEPT];
u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
-struct __attribute__ ((__packed__)) vmcb_seg {
+struct vmcb_seg {
u16 selector;
u16 attrib;
u32 limit;
u64 base;
-};
+} __packed;
-struct __attribute__ ((__packed__)) vmcb_save_area {
+struct vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
struct vmcb_seg ss;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
-};
+ /*
+ * The following part of the save area is valid only for
+ * SEV-ES guests when referenced through the GHCB.
+ */
+ u8 reserved_7[104];
+ u64 reserved_8; /* rax already available at 0x01f8 */
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_9; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_10[16];
+ u64 sw_exit_code;
+ u64 sw_exit_info_1;
+ u64 sw_exit_info_2;
+ u64 sw_scratch;
+ u8 reserved_11[56];
+ u64 xcr0;
+ u8 valid_bitmap[16];
+ u64 x87_state_gpa;
+} __packed;
+
+struct ghcb {
+ struct vmcb_save_area save;
+ u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
+
+ u8 shared_buffer[2032];
+
+ u8 reserved_1[10];
+ u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
+ u32 ghcb_usage;
+} __packed;
+
+
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256
+#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
{
- BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298);
- BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256);
+ BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
}
-struct __attribute__ ((__packed__)) vmcb {
+struct vmcb {
struct vmcb_control_area control;
u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save;
-};
+} __packed;
#define SVM_CPUID_FUNC 0x8000000a
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_READ 0
-#define INTERCEPT_CR3_READ 3
-#define INTERCEPT_CR4_READ 4
-#define INTERCEPT_CR8_READ 8
-#define INTERCEPT_CR0_WRITE (16 + 0)
-#define INTERCEPT_CR3_WRITE (16 + 3)
-#define INTERCEPT_CR4_WRITE (16 + 4)
-#define INTERCEPT_CR8_WRITE (16 + 8)
-
-#define INTERCEPT_DR0_READ 0
-#define INTERCEPT_DR1_READ 1
-#define INTERCEPT_DR2_READ 2
-#define INTERCEPT_DR3_READ 3
-#define INTERCEPT_DR4_READ 4
-#define INTERCEPT_DR5_READ 5
-#define INTERCEPT_DR6_READ 6
-#define INTERCEPT_DR7_READ 7
-#define INTERCEPT_DR0_WRITE (16 + 0)
-#define INTERCEPT_DR1_WRITE (16 + 1)
-#define INTERCEPT_DR2_WRITE (16 + 2)
-#define INTERCEPT_DR3_WRITE (16 + 3)
-#define INTERCEPT_DR4_WRITE (16 + 4)
-#define INTERCEPT_DR5_WRITE (16 + 5)
-#define INTERCEPT_DR6_WRITE (16 + 6)
-#define INTERCEPT_DR7_WRITE (16 + 7)
-
#define SVM_EVTINJ_VEC_MASK 0xff
#define SVM_EVTINJ_TYPE_SHIFT 8
#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+/* GHCB Accessor functions */
+
+#define GHCB_BITMAP_IDX(field) \
+ (offsetof(struct vmcb_save_area, field) / sizeof(u64))
+
+#define DEFINE_GHCB_ACCESSORS(field) \
+ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ } \
+ \
+ static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
+ { \
+ __set_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ ghcb->save.field = value; \
+ }
+
+DEFINE_GHCB_ACCESSORS(cpl)
+DEFINE_GHCB_ACCESSORS(rip)
+DEFINE_GHCB_ACCESSORS(rsp)
+DEFINE_GHCB_ACCESSORS(rax)
+DEFINE_GHCB_ACCESSORS(rcx)
+DEFINE_GHCB_ACCESSORS(rdx)
+DEFINE_GHCB_ACCESSORS(rbx)
+DEFINE_GHCB_ACCESSORS(rbp)
+DEFINE_GHCB_ACCESSORS(rsi)
+DEFINE_GHCB_ACCESSORS(rdi)
+DEFINE_GHCB_ACCESSORS(r8)
+DEFINE_GHCB_ACCESSORS(r9)
+DEFINE_GHCB_ACCESSORS(r10)
+DEFINE_GHCB_ACCESSORS(r11)
+DEFINE_GHCB_ACCESSORS(r12)
+DEFINE_GHCB_ACCESSORS(r13)
+DEFINE_GHCB_ACCESSORS(r14)
+DEFINE_GHCB_ACCESSORS(r15)
+DEFINE_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_GHCB_ACCESSORS(sw_scratch)
+DEFINE_GHCB_ACCESSORS(xcr0)
+
#endif
#include <linux/preempt.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
#ifdef CONFIG_X86_32
static inline void iret_to_self(void)
*
* b) Text was modified on a different CPU, may subsequently be
* executed on this CPU, and you want to make sure the new version
- * gets executed. This generally means you're calling this in a IPI.
+ * gets executed. This generally means you're calling this in an IPI.
*
* If you're calling this for a different reason, you're probably doing
* it wrong.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
*/
static inline void sync_core(void)
{
/*
- * There are quite a few ways to do this. IRET-to-self is nice
- * because it works on every CPU, at any CPL (so it's compatible
- * with paravirtualization), and it never exits to a hypervisor.
- * The only down sides are that it's a bit slow (it seems to be
- * a bit more than 2x slower than the fastest options) and that
- * it unmasks NMIs. The "push %cs" is needed because, in
- * paravirtual environments, __KERNEL_CS may not be a valid CS
- * value when we do IRET directly.
+ * The SERIALIZE instruction is the most straightforward way to
+ * do this, but it is not universally available.
+ */
+ if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
+ serialize();
+ return;
+ }
+
+ /*
+ * For all other processors, there are quite a few ways to do this.
+ * IRET-to-self is nice because it works on every CPU, at any CPL
+ * (so it's compatible with paravirtualization), and it never exits
+ * to a hypervisor. The only downsides are that it's a bit slow
+ * (it seems to be a bit more than 2x slower than the fastest
+ * options) and that it unmasks NMIs. The "push %cs" is needed,
+ * because in paravirtual environments __KERNEL_CS may not be a
+ * valid CS value when we do IRET directly.
*
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
* hypervisor.
- *
- * Like all of Linux's memory ordering operations, this is a
- * compiler barrier as well.
*/
iret_to_self();
}
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES)
#define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT)
#define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING)
-#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
+#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC)
#define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID)
#define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING)
__u32 indices[0];
};
+/* Maximum size of any access bitmap in bytes */
+#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600
+
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range {
+#define KVM_MSR_FILTER_READ (1 << 0)
+#define KVM_MSR_FILTER_WRITE (1 << 1)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+};
+
+#define KVM_MSR_FILTER_MAX_RANGES 16
+struct kvm_msr_filter {
+#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
struct kvm_cpuid_entry {
__u32 function;
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_RDPRU 0x08e
+#define SVM_EXIT_INVPCID 0x0a2
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
{ SVM_EXIT_MONITOR, "monitor" }, \
{ SVM_EXIT_MWAIT, "mwait" }, \
{ SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_INVPCID, "invpcid" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
{
struct pt_regs *old_regs = set_irq_regs(regs);
u32 token;
- irqentry_state_t state;
- state = irqentry_enter(regs);
+ ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
}
- irqentry_exit(regs, state);
set_irq_regs(old_regs);
}
if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
return;
- /* Enable guest halt poll disables host halt poll */
+ /* Disable guest halt poll enables host halt poll */
smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
* value that, lies close to the top of the kernel memory. The limit for the GDT
* and the IDT are set to zero.
*
- * Given that SLDT and STR are not commonly used in programs that run on WineHQ
- * or DOSEMU2, they are not emulated.
- *
- * The instruction smsw is emulated to return the value that the register CR0
+ * The instruction SMSW is emulated to return the value that the register CR0
* has at boot time as set in the head_32.
+ * SLDT and STR are emulated to return the values that the kernel programmatically
+ * assigns:
+ * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not.
+ * - STR returns (GDT_ENTRY_TSS * 8).
*
* Emulation is provided for both 32-bit and 64-bit processes.
*
*data_size += UMIP_GDT_IDT_LIMIT_SIZE;
memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
- } else if (umip_inst == UMIP_INST_SMSW) {
- unsigned long dummy_value = CR0_STATE;
+ } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT ||
+ umip_inst == UMIP_INST_STR) {
+ unsigned long dummy_value;
+
+ if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+ } else if (umip_inst == UMIP_INST_STR) {
+ dummy_value = GDT_ENTRY_TSS * 8;
+ } else if (umip_inst == UMIP_INST_SLDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ down_read(&current->mm->context.ldt_usr_sem);
+ if (current->mm->context.ldt)
+ dummy_value = GDT_ENTRY_LDT * 8;
+ else
+ dummy_value = 0;
+ up_read(&current->mm->context.ldt_usr_sem);
+#else
+ dummy_value = 0;
+#endif
+ }
/*
- * Even though the CR0 register has 4 bytes, the number
+ * For these 3 instructions, the number
* of bytes to be copied in the result buffer is determined
* by whether the operand is a register or a memory location.
* If operand is a register, return as many bytes as the operand
* size. If operand is memory, return only the two least
- * siginificant bytes of CR0.
+ * significant bytes.
*/
if (X86_MODRM_MOD(insn->modrm.value) == 3)
*data_size = insn->opnd_bytes;
*data_size = 2;
memcpy(data, &dummy_value, *data_size);
- /* STR and SLDT are not emulated */
} else {
return -EINVAL;
}
umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
- /* Do not emulate (spoof) SLDT or STR. */
- if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT)
- return false;
-
umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
-kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
+kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
obj-$(CONFIG_KVM) += kvm.o
not_found:
return 36;
}
-EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE)
+ F(SERIALIZE) | F(TSXLDTRK)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
return vcpu->arch.maxphyaddr;
}
+static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+}
+
struct cpuid_reg {
u32 function;
u32 index;
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
val = GET_SMSTATE(u32, smstate, 0x7fcc);
- ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
+
val = GET_SMSTATE(u32, smstate, 0x7fc8);
- ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
val = GET_SMSTATE(u32, smstate, 0x7f68);
- ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
+
val = GET_SMSTATE(u32, smstate, 0x7f60);
- ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
val = GET_SMSTATE(u64, smstate, 0x7ed0);
- ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7e90);
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
u64 tsc_aux = 0;
if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
- return emulate_gp(ctxt, 0);
+ return emulate_ud(ctxt);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
u64 msr_data;
+ int r;
msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
- if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+ r = ctxt->ops->set_msr(ctxt, msr_index, msr_data);
+
+ if (r == X86EMUL_IO_NEEDED)
+ return r;
+
+ if (r)
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
u64 msr_data;
+ int r;
+
+ r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data);
+
+ if (r == X86EMUL_IO_NEEDED)
+ return r;
- if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
+ if (r)
return emulate_gp(ctxt, 0);
*reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
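+ /* Reject guest writes to the timer config while the SynIC is inactive. */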
+ if (!synic->active && !host)
+ return 1;
trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && !host)
+ return 1;
+
trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
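+/* Write the Destination Format Register and mark the APIC map dirty so it is recalculated. */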
+static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
+{
+ kvm_lapic_set_reg(apic, APIC_DFR, val);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
{
return ((id >> 4) << 16) | (1 << (id & 0xf));
}
}
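+/* Clear a single vector from this vCPU's IRR. */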
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
+{
+ apic_clear_irr(vec, vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu;
struct kvm_lapic *apic = vcpu->arch.apic;
u64 guest_tsc, tsc_deadline;
- if (apic->lapic_timer.expired_tscdeadline == 0)
- return;
-
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
{
- if (lapic_timer_int_injected(vcpu))
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns &&
+ lapic_timer_int_injected(vcpu))
__kvm_wait_lapic_expire(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
- if (apic->lapic_timer.timer_advance_ns)
- __kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
atomic_inc(&apic->lapic_timer.pending);
- kvm_set_pending_timer(vcpu);
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ if (from_timer_fn)
+ kvm_vcpu_kick(vcpu);
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
break;
case APIC_DFR:
- if (!apic_x2apic_mode(apic)) {
- kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
- } else
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
+ else
ret = 1;
break;
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!lapic_in_kernel(vcpu) ||
- !apic_lvtt_tscdeadline(apic))
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
return 0;
return apic->lapic_timer.tscdeadline;
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
- apic_lvtt_period(apic))
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
return;
hrtimer_cancel(&apic->lapic_timer.timer);
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
- kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU);
+ kvm_apic_set_dfr(apic, 0xffffffffU);
apic_set_spiv(apic, 0xff);
kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
if (!apic_x2apic_mode(apic))
__apic_update_ppr(apic, &ppr);
return apic_has_interrupt_for_ppr(apic, ppr);
}
+EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
-static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
-}
-
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
#define PTE_LIST_EXT 3
/*
- * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
+ *
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
- *
- * For handle_mmio_page_fault only:
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
*/
enum {
RET_PF_RETRY = 0,
- RET_PF_EMULATE = 1,
- RET_PF_INVALID = 2,
+ RET_PF_EMULATE,
+ RET_PF_INVALID,
+ RET_PF_FIXED,
+ RET_PF_SPURIOUS,
};
struct pte_list_desc {
struct x86_exception *exception)
{
/* Check if guest physical address doesn't exceed guest maximum */
- if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+ if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
exception->error_code |= PFERR_RSVD_MASK;
return UNMAPPED_GVA;
}
}
if (sp->unsync_children)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
__clear_sp_write_flooding_count(sp);
}
}
-static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
- u64 *spte)
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte, struct list_head *invalid_list)
{
u64 pte;
struct kvm_mmu_page *child;
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
- }
- return true;
- }
- if (is_mmio_spte(pte))
+ /*
+ * Recursively zap nested TDP SPs; parentless SPs are
+ * unlikely to be used again in the near future. This
+ * avoids retaining a large number of stale nested SPs.
+ */
+ if (tdp_enabled && invalid_list &&
+ child->role.guest_mode && !child->parent_ptes.val)
+ return kvm_mmu_prepare_zap_page(kvm, child,
+ invalid_list);
+ }
+ } else if (is_mmio_spte(pte)) {
mmu_spte_clear_no_track(spte);
-
- return false;
+ }
+ return 0;
}
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *sp)
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
+ int zapped = 0;
unsigned i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- mmu_page_zap_pte(kvm, sp, sp->spt + i);
+ zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+ return zapped;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
- kvm_mmu_page_unlink_children(kvm, sp);
+ *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
/* Bits which may be returned by set_spte() */
#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+#define SET_SPTE_SPURIOUS BIT(2)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned int pte_access, int level,
spte = mark_spte_for_access_track(spte);
set_pte:
- if (mmu_spte_update(sptep, spte))
+ if (*sptep == spte)
+ ret |= SET_SPTE_SPURIOUS;
+ else if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
return ret;
}
static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, int write_fault, int level,
+ unsigned int pte_access, bool write_fault, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
int set_spte_ret;
- int ret = RET_PF_RETRY;
+ int ret = RET_PF_FIXED;
bool flush = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
if (unlikely(is_mmio_spte(*sptep)))
ret = RET_PF_EMULATE;
+ /*
+ * The fault is fully spurious if and only if the new SPTE and old SPTE
+ * are identical, and emulation is not required.
+ */
+ if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
+ WARN_ON_ONCE(!was_rmapped);
+ return RET_PF_SPURIOUS;
+ }
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped && is_large_pte(*sptep))
return -1;
for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
put_page(pages[i]);
}
}
static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp)
+ int max_level, kvm_pfn_t *pfnp,
+ bool huge_page_disallowed, int *req_level)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
kvm_pfn_t mask;
int level;
+ *req_level = PG_LEVEL_4K;
+
if (unlikely(max_level == PG_LEVEL_4K))
return PG_LEVEL_4K;
if (level == PG_LEVEL_4K)
return level;
- level = min(level, max_level);
+ *req_level = level = min(level, max_level);
+
+ /*
+ * Enforce the iTLB multihit workaround after capturing the requested
+ * level, which will be used to do precise, accurate accounting.
+ */
+ if (huge_page_disallowed)
+ return PG_LEVEL_4K;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
u64 spte = *it.sptep;
if (it.level == level && level > PG_LEVEL_4K &&
- is_nx_huge_page_enabled() &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault, bool account_disallowed_nx_lpage)
+ bool prefault, bool is_tdp)
{
+ bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+ bool write = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int level, ret;
+ int level, req_level, ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+ huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+ if (nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
- if (account_disallowed_nx_lpage)
+ if (is_tdp && huge_page_disallowed &&
+ req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
write, level, base_gfn, pfn, prefault,
map_writable);
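+ /* A spurious result means the desired SPTE was already present, e.g. installed by another vCPU. */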
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
direct_pte_prefetch(vcpu, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
}
/*
- * Return value:
- * - true: let the vcpu to access on the same address again.
- * - false: let the real page fault path to fix it.
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
*/
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 error_code)
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- bool fault_handled = false;
+ int ret = RET_PF_INVALID;
u64 spte = 0ull;
uint retry_count = 0;
if (!page_fault_can_be_fast(error_code))
- return false;
+ return ret;
walk_shadow_page_lockless_begin(vcpu);
* they are always ACC_ALL.
*/
if (is_access_allowed(error_code, spte)) {
- fault_handled = true;
+ ret = RET_PF_SPURIOUS;
break;
}
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
- iterator.sptep, spte,
- new_spte);
- if (fault_handled)
+ if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
+ new_spte)) {
+ ret = RET_PF_FIXED;
break;
+ }
if (++retry_count > 4) {
printk_once(KERN_WARNING
} while (true);
trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
- spte, fault_handled);
+ spte, ret);
walk_shadow_page_lockless_end(vcpu);
- return fault_handled;
+ return ret;
}
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ struct kvm *kvm = vcpu->kvm;
int i;
LIST_HEAD(invalid_list);
bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
- mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
&invalid_list);
if (free_active_root) {
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
- &invalid_list);
+ mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
} else {
for (i = 0; i < 4; ++i)
if (mmu->pae_root[i] != 0)
- mmu_free_root_page(vcpu->kvm,
+ mmu_free_root_page(kvm,
&mmu->pae_root[i],
&invalid_list);
mmu->root_hpa = INVALID_PAGE;
mmu->root_pgd = 0;
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
bool prefault, int max_level, bool is_tdp)
{
bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool lpage_disallowed = exec && is_nx_huge_page_enabled();
bool map_writable;
gfn_t gfn = gpa >> PAGE_SHIFT;
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (fast_page_fault(vcpu, gpa, error_code))
- return RET_PF_RETRY;
+ r = fast_page_fault(vcpu, gpa, error_code);
+ if (r != RET_PF_INVALID)
+ return r;
r = mmu_topup_memory_caches(vcpu, false);
if (r)
return r;
- if (lpage_disallowed)
- max_level = PG_LEVEL_4K;
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
- prefault, is_tdp && lpage_disallowed);
+ r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
+ prefault, is_tdp);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
entry = *spte;
- mmu_page_zap_pte(vcpu->kvm, sp, spte);
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry &&
!((sp->role.word ^ base_role) & ~role_ign.word) &&
rmap_can_add(vcpu))
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code), false);
- WARN_ON(r == RET_PF_INVALID);
+ if (WARN_ON_ONCE(r == RET_PF_INVALID))
+ return -EIO;
}
- if (r == RET_PF_RETRY)
- return 1;
if (r < 0)
return r;
+ if (r != RET_PF_EMULATE)
+ return 1;
/*
* Before emulating the instruction, check if the error code
if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
- /*
- * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
- * This can happen if a guest gets a page-fault on data access but the HW
- * table walker is not able to read the instruction page (e.g instruction
- * page is not present in memory). In those cases we simply restart the
- * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
- */
- if (unlikely(insn && !insn_len)) {
- if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu))
- return 1;
- }
-
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
free_page((unsigned long)mmu->lm_root);
}
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
struct page *page;
int i;
+ mmu->root_hpa = INVALID_PAGE;
+ mmu->root_pgd = 0;
+ mmu->translate_gpa = translate_gpa;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- uint i;
int ret;
vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.root_mmu.root_pgd = 0;
- vcpu->arch.root_mmu.translate_gpa = translate_gpa;
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
- vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.guest_mmu.root_pgd = 0;
- vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
- ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
if (ret)
return ret;
- ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
if (ret)
goto fail_allocate_root;
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
- while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ for ( ; to_zap; --to_zap) {
+ if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ break;
+
/*
* We use a separate list instead of just using active_mmu_pages
* because the number of lpage_disallowed pages is expected to
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
- if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (to_zap)
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
__entry->access)
);
-#define __spte_satisfied(__spte) \
- (__entry->retry && is_writable_pte(__entry->__spte))
-
TRACE_EVENT(
fast_page_fault,
TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
- u64 *sptep, u64 old_spte, bool retry),
- TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
+ u64 *sptep, u64 old_spte, int ret),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
TP_STRUCT__entry(
__field(int, vcpu_id)
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
- __field(bool, retry)
+ __field(int, ret)
),
TP_fast_assign(
__entry->sptep = sptep;
__entry->old_spte = old_spte;
__entry->new_spte = *sptep;
- __entry->retry = retry;
+ __entry->ret = ret;
),
TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
__entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
kvm_mmu_trace_pferr_flags), __entry->sptep,
__entry->old_spte, __entry->new_spte,
- __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
)
);
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn,
+ mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
true, true);
kvm_release_pfn_clean(pfn);
* emulate this operation, return 1 to indicate this case.
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
- struct guest_walker *gw,
- int write_fault, int max_level,
- kvm_pfn_t pfn, bool map_writable, bool prefault,
- bool lpage_disallowed)
+ struct guest_walker *gw, u32 error_code,
+ int max_level, kvm_pfn_t pfn, bool map_writable,
+ bool prefault)
{
+ bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+ bool write_fault = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, hlevel, ret;
+ int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
direct_access = gw->pte_access;
link_shadow_page(vcpu, it.sptep, sp);
}
- hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn);
+ level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
+ huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel);
+ if (nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level);
base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == hlevel)
+ if (it.level == level)
break;
validate_direct_spte(vcpu, it.sptep, direct_access);
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
- if (lpage_disallowed)
+ if (huge_page_disallowed && req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
it.level, base_gfn, pfn, prefault, map_writable);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
*/
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker, int user_fault,
+ struct guest_walker *walker, bool user_fault,
bool *write_fault_to_shadow_pgtable)
{
int level;
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
bool prefault)
{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
+ bool write_fault = error_code & PFERR_WRITE_MASK;
+ bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
- bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
- is_nx_huge_page_enabled();
int max_level;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
- if (lpage_disallowed || is_self_change_mapping)
+ if (is_self_change_mapping)
max_level = PG_LEVEL_4K;
else
max_level = walker.level;
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
- map_writable, prefault, lpage_disallowed);
+ r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
+ map_writable, prefault);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
+ u64 old_spte;
int level;
u64 *sptep;
sptep = iterator.sptep;
sp = sptep_to_sp(sptep);
- if (is_last_spte(*sptep, level)) {
+ old_spte = *sptep;
+ if (is_last_spte(old_spte, level)) {
pt_element_t gpte;
gpa_t pte_gpa;
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
- if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+ mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
+ if (is_shadow_present_pte(old_spte))
kvm_flush_remote_tlbs_with_address(vcpu->kvm,
sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
return 0;
/* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!p_page)
goto free_avic;
kvm_svm->avic_physical_id_table_page = p_page;
- clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!l_page)
goto free_avic;
kvm_svm->avic_logical_id_table_page = l_page;
- clear_page(page_address(l_page));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h, *g;
+ unsigned int i;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
h = &svm->nested.hsave->control;
g = &svm->nested.ctl;
- svm->nested.host_intercept_exceptions = h->intercept_exceptions;
-
- c->intercept_cr = h->intercept_cr;
- c->intercept_dr = h->intercept_dr;
- c->intercept_exceptions = h->intercept_exceptions;
- c->intercept = h->intercept;
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
/* We only want the cr8 intercept bits of L1 */
- c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
- c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
+ vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
/*
* Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
* affect any interrupt we may want to inject; therefore,
* interrupt window vmexits are irrelevant to L0.
*/
- c->intercept &= ~(1ULL << INTERCEPT_VINTR);
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
/* We don't want to see VMMCALLs from a nested guest */
- c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
- c->intercept_cr |= g->intercept_cr;
- c->intercept_dr |= g->intercept_dr;
- c->intercept_exceptions |= g->intercept_exceptions;
- c->intercept |= g->intercept;
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] |= g->intercepts[i];
}
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
struct vmcb_control_area *from)
{
- dst->intercept_cr = from->intercept_cr;
- dst->intercept_dr = from->intercept_dr;
- dst->intercept_exceptions = from->intercept_exceptions;
- dst->intercept = from->intercept;
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
dst->iopm_base_pa = from->iopm_base_pa;
dst->msrpm_base_pa = from->msrpm_base_pa;
dst->tsc_offset = from->tsc_offset;
*/
int i;
- if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
for (i = 0; i < MSRPM_OFFSETS; i++) {
return true;
}
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
{
- if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
return false;
if (control->asid == 0)
return true;
}
-static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb)
+static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
- bool nested_vmcb_lma;
- if ((vmcb->save.efer & EFER_SVME) == 0)
+ bool vmcb12_lma;
+
+ if ((vmcb12->save.efer & EFER_SVME) == 0)
return false;
- if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
- (vmcb->save.cr0 & X86_CR0_NW))
+ if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
return false;
- if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7))
+ if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
return false;
- nested_vmcb_lma =
- (vmcb->save.efer & EFER_LME) &&
- (vmcb->save.cr0 & X86_CR0_PG);
+ vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
- if (!nested_vmcb_lma) {
- if (vmcb->save.cr4 & X86_CR4_PAE) {
- if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
+ if (!vmcb12_lma) {
+ if (vmcb12->save.cr4 & X86_CR4_PAE) {
+ if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
return false;
} else {
- if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
+ if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
return false;
}
} else {
- if (!(vmcb->save.cr4 & X86_CR4_PAE) ||
- !(vmcb->save.cr0 & X86_CR0_PE) ||
- (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK))
+ if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
+ !(vmcb12->save.cr0 & X86_CR0_PE) ||
+ (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
return false;
}
- if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4))
+ if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
return false;
- return nested_vmcb_check_controls(&vmcb->control);
+ return nested_vmcb_check_controls(&vmcb12->control);
}
static void load_nested_vmcb_control(struct vcpu_svm *svm,
* EXIT_INT_INFO.
*/
static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
- struct vmcb *nested_vmcb)
+ struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 exit_int_info = 0;
if (vcpu->arch.exception.has_error_code) {
exit_int_info |= SVM_EVTINJ_VALID_ERR;
- nested_vmcb->control.exit_int_info_err =
+ vmcb12->control.exit_int_info_err =
vcpu->arch.exception.error_code;
}
exit_int_info |= SVM_EVTINJ_TYPE_INTR;
}
- nested_vmcb->control.exit_int_info = exit_int_info;
+ vmcb12->control.exit_int_info = exit_int_info;
}
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return 0;
}
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
+static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
/* Load the nested guest state */
- svm->vmcb->save.es = nested_vmcb->save.es;
- svm->vmcb->save.cs = nested_vmcb->save.cs;
- svm->vmcb->save.ss = nested_vmcb->save.ss;
- svm->vmcb->save.ds = nested_vmcb->save.ds;
- svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
- svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
- svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
- svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
- svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
- kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
- kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
+ svm->vmcb->save.es = vmcb12->save.es;
+ svm->vmcb->save.cs = vmcb12->save.cs;
+ svm->vmcb->save.ss = vmcb12->save.ss;
+ svm->vmcb->save.ds = vmcb12->save.ds;
+ svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+ svm->vmcb->save.idtr = vmcb12->save.idtr;
+ kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
+ svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+ svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
+ svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+ kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
+ kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
+ kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = nested_vmcb->save.rax;
- svm->vmcb->save.rsp = nested_vmcb->save.rsp;
- svm->vmcb->save.rip = nested_vmcb->save.rip;
- svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
- svm->vcpu.arch.dr6 = nested_vmcb->save.dr6;
- svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+ svm->vmcb->save.rax = vmcb12->save.rax;
+ svm->vmcb->save.rsp = vmcb12->save.rsp;
+ svm->vmcb->save.rip = vmcb12->save.rip;
+ svm->vmcb->save.dr7 = vmcb12->save.dr7;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6;
+ svm->vmcb->save.cpl = vmcb12->save.cpl;
}
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
vmcb_mark_all_dirty(svm->vmcb);
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb)
+int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+ struct vmcb *vmcb12)
{
int ret;
- svm->nested.vmcb = vmcb_gpa;
- load_nested_vmcb_control(svm, &nested_vmcb->control);
- nested_prepare_vmcb_save(svm, nested_vmcb);
+ svm->nested.vmcb12_gpa = vmcb12_gpa;
+ load_nested_vmcb_control(svm, &vmcb12->control);
+ nested_prepare_vmcb_save(svm, vmcb12);
nested_prepare_vmcb_control(svm);
- ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3,
+ ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
if (ret)
return ret;
int nested_svm_vmrun(struct vcpu_svm *svm)
{
int ret;
- struct vmcb *nested_vmcb;
+ struct vmcb *vmcb12;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
- u64 vmcb_gpa;
+ u64 vmcb12_gpa;
if (is_smm(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
- vmcb_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+ vmcb12_gpa = svm->vmcb->save.rax;
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
kvm_inject_gp(&svm->vcpu, 0);
return 1;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
- if (!nested_vmcb_checks(svm, nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
+ if (!nested_vmcb_checks(svm, vmcb12)) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
goto out;
}
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl);
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb))
+ if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
int nested_svm_vmexit(struct vcpu_svm *svm)
{
int rc;
- struct vmcb *nested_vmcb;
+ struct vmcb *vmcb12;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+ rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
kvm_inject_gp(&svm->vcpu, 0);
return 1;
}
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
- svm->nested.vmcb = 0;
+ svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
/* Give the current vmcb to the guest */
- svm_set_gif(svm, false);
- nested_vmcb->save.es = vmcb->save.es;
- nested_vmcb->save.cs = vmcb->save.cs;
- nested_vmcb->save.ss = vmcb->save.ss;
- nested_vmcb->save.ds = vmcb->save.ds;
- nested_vmcb->save.gdtr = vmcb->save.gdtr;
- nested_vmcb->save.idtr = vmcb->save.idtr;
- nested_vmcb->save.efer = svm->vcpu.arch.efer;
- nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
- nested_vmcb->save.cr2 = vmcb->save.cr2;
- nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
- nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu);
- nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu);
- nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu);
- nested_vmcb->save.dr7 = vmcb->save.dr7;
- nested_vmcb->save.dr6 = svm->vcpu.arch.dr6;
- nested_vmcb->save.cpl = vmcb->save.cpl;
-
- nested_vmcb->control.int_state = vmcb->control.int_state;
- nested_vmcb->control.exit_code = vmcb->control.exit_code;
- nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
- nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
- nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
-
- if (nested_vmcb->control.exit_code != SVM_EXIT_ERR)
- nested_vmcb_save_pending_event(svm, nested_vmcb);
+ vmcb12->save.es = vmcb->save.es;
+ vmcb12->save.cs = vmcb->save.cs;
+ vmcb12->save.ss = vmcb->save.ss;
+ vmcb12->save.ds = vmcb->save.ds;
+ vmcb12->save.gdtr = vmcb->save.gdtr;
+ vmcb12->save.idtr = vmcb->save.idtr;
+ vmcb12->save.efer = svm->vcpu.arch.efer;
+ vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ vmcb12->save.cr2 = vmcb->save.cr2;
+ vmcb12->save.cr4 = svm->vcpu.arch.cr4;
+ vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
+ vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
+ vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
+ vmcb12->save.dr7 = vmcb->save.dr7;
+ vmcb12->save.dr6 = svm->vcpu.arch.dr6;
+ vmcb12->save.cpl = vmcb->save.cpl;
+
+ vmcb12->control.int_state = vmcb->control.int_state;
+ vmcb12->control.exit_code = vmcb->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
+
+ if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+ nested_vmcb_save_pending_event(svm, vmcb12);
if (svm->nrips_enabled)
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
+ vmcb12->control.next_rip = vmcb->control.next_rip;
- nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl;
- nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
- nested_vmcb->control.event_inj = svm->nested.ctl.event_inj;
- nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
+ vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
+ vmcb12->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- nested_vmcb->control.pause_filter_count =
+ vmcb12->control.pause_filter_count =
svm->vmcb->control.pause_filter_count;
- nested_vmcb->control.pause_filter_thresh =
+ vmcb12->control.pause_filter_thresh =
svm->vmcb->control.pause_filter_thresh;
/* Restore the original control entries */
copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ /* On vmexit the GIF is set to false */
+ svm_set_gif(svm, false);
+
svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
svm->vcpu.arch.l1_tsc_offset;
vmcb_mark_all_dirty(svm->vmcb);
- trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
- nested_vmcb->control.exit_info_1,
- nested_vmcb->control.exit_info_2,
- nested_vmcb->control.exit_int_info,
- nested_vmcb->control.exit_int_info_err,
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
copy_vmcb_control_area(&vmcb->control, &hsave->control);
nested_svm_uninit_mmu_context(&svm->vcpu);
}
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
u32 offset, msr, value;
int write, mask;
- if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u8 start_bit;
u64 gpa;
- if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.ctl.intercept_cr & bit)
+ if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.ctl.intercept_dr & bit)
+ if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
break;
}
default: {
- u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- if (svm->nested.ctl.intercept & exit_bits)
+ if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
}
}
{
unsigned int nr = svm->vcpu.arch.exception.nr;
- return (svm->nested.ctl.intercept_exceptions & (1 << nr));
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
}
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
static void nested_svm_init(struct vcpu_svm *svm)
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
+ if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
return NESTED_EXIT_HOST;
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
svm->vcpu.arch.apf.host_apf_flags)
/* First fill in the header and copy it out. */
if (is_guest_mode(vcpu)) {
- kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb;
+ kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
- struct vmcb_control_area ctl;
- struct vmcb_save_area save;
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ int ret;
u32 cr0;
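+ /* Both structures must fit in the VMCB area of the nested state blob. */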
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
return -EINVAL;
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
svm_leave_nested(svm);
- goto out_set_gif;
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+ return 0;
}
if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
return -EINVAL;
if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
return -EINVAL;
- if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
- return -EFAULT;
- if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
- return -EFAULT;
- if (!nested_vmcb_check_controls(&ctl))
- return -EINVAL;
+ ret = -ENOMEM;
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ save = kzalloc(sizeof(*save), GFP_KERNEL);
+ if (!ctl || !save)
+ goto out_free;
+
+ ret = -EFAULT;
+ if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
+ goto out_free;
+ if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
+ goto out_free;
+
+ ret = -EINVAL;
+ if (!nested_vmcb_check_controls(ctl))
+ goto out_free;
/*
* Processor state contains L2 state. Check that it is
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
- return -EINVAL;
+ goto out_free;
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
* TODO: validate reserved bits for all saved state.
*/
- if (!(save.cr0 & X86_CR0_PG))
- return -EINVAL;
+ if (!(save->cr0 & X86_CR0_PG))
+ goto out_free;
/*
* All checks done, we can enter guest mode. L1 control fields
* contains saved L1 state.
*/
copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = save;
+ hsave->save = *save;
- svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, &ctl);
+ svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+ load_nested_vmcb_control(svm, ctl);
nested_prepare_vmcb_control(svm);
-out_set_gif:
- svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
- return 0;
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
}
struct kvm_x86_nested_ops svm_nested_ops = {
.check_events = svm_check_nested_events,
+ .get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
};
}
/*
- * The LAUNCH_UPDATE command will perform in-place encryption of the
- * memory content (i.e it will write the same memory region with C=1).
- * It's possible that the cache may contain the data with C=0, i.e.,
- * unencrypted so invalidate it first.
+ * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(inpages, npages);
}
/*
- * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
- * memory content (i.e it will write the same memory region with C=1).
- * It's possible that the cache may contain the data with C=0, i.e.,
- * unencrypted so invalidate it first.
+ * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+ * the pages; flush the destination too so that future accesses do not
+ * see stale data.
*/
sev_clflush_pages(src_p, 1);
sev_clflush_pages(dst_p, 1);
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
- unsigned long n;
+ unsigned long n, i;
int ret, offset;
if (!sev_guest(kvm))
if (IS_ERR(pages))
return PTR_ERR(pages);
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(pages, n);
+
/*
* The secret must be copied into a contiguous memory region, so verify
* that the userspace memory pages are contiguous before issuing the command.
e_free:
kfree(data);
e_unpin_memory:
+ /* The page contents were updated, so mark the pages dirty before unpinning. */
+ for (i = 0; i < n; i++) {
+ set_page_dirty_lock(pages[i]);
+ mark_page_accessed(pages[i]);
+ }
sev_unpin_memory(kvm, pages, n);
return ret;
}
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
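+ /* Unregistering a large number of regions can take a while; allow rescheduling. */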
+ cond_resched();
}
}
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
-} direct_access_msrs[] = {
+} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
}
-static bool valid_msr_intercept(u32 index)
+static int direct_access_msr_slot(u32 msr)
{
- int i;
+ u32 i;
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == index)
- return true;
+ if (direct_access_msrs[i].index == msr)
+ return i;
- return false;
+ return -ENOENT;
}
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
+ int write)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int slot = direct_access_msr_slot(msr);
+
+ if (slot == -ENOENT)
+ return;
+
+ /* Set the shadow bitmaps to the desired intercept states */
+ if (read)
+ set_bit(slot, svm->shadow_msr_intercept.read);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.read);
+
+ if (write)
+ set_bit(slot, svm->shadow_msr_intercept.write);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.write);
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ return direct_access_msr_slot(index) != -ENOENT;
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
u8 bit_write;
unsigned long tmp;
return !!test_bit(bit_write, &tmp);
}
-static void set_msr_interception(u32 *msrpm, unsigned msr,
- int read, int write)
+static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
+ u32 msr, int read, int write)
{
u8 bit_read, bit_write;
unsigned long tmp;
*/
WARN_ON(!valid_msr_intercept(msr));
+ /* Force interception of MSRs that the userspace filter does not allow. */
+ if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ read = 0;
+
+ if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ write = 0;
+
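+ /* Each MSR occupies two adjacent bits in the permission map: the read-intercept bit, then the write-intercept bit. */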
offset = svm_msrpm_offset(msr);
bit_read = 2 * (msr & 0x0f);
bit_write = 2 * (msr & 0x0f) + 1;
msrpm[offset] = tmp;
}
-static void svm_vcpu_init_msrpm(u32 *msrpm)
+static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write)
+{
+ set_shadow_msr_intercept(vcpu, msr, read, write);
+ set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
+}
+
+static u32 *svm_vcpu_alloc_msrpm(void)
{
- int i;
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+ u32 *msrpm;
+
+ if (!pages)
+ return NULL;
+ msrpm = page_address(pages);
memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ return msrpm;
+}
+
+static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+{
+ int i;
+
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
if (!direct_access_msrs[i].always)
continue;
+ set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+static void svm_vcpu_free_msrpm(u32 *msrpm)
+{
+ __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+}
+
+static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all direct access MSRs again. They
+ * will automatically get filtered through the MSR filter, so we are
+ * back in sync after this.
+ */
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 msr = direct_access_msrs[i].index;
+ u32 read = test_bit(i, svm->shadow_msr_intercept.read);
+ u32 write = test_bit(i, svm->shadow_msr_intercept.write);
- set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+ set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
}
}
}
}
-static void svm_enable_lbrv(struct vcpu_svm *svm)
+static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- u32 *msrpm = svm->msrpm;
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
-static void svm_disable_lbrv(struct vcpu_svm *svm)
+static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
- u32 *msrpm = svm->msrpm;
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* Enable INVPCID feature */
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
}
static __init int svm_hardware_setup(void)
return svm->vmcb->control.tsc_offset;
}
+static void svm_check_invpcid(struct vcpu_svm *svm)
+{
+ /*
+ * Intercept the INVPCID instruction only when shadow paging is in
+ * use. Interception is not required when nested paging (NPT) is
+ * enabled.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+ if (!npt_enabled)
+ svm_set_intercept(svm, INTERCEPT_INVPCID);
+ else
+ svm_clr_intercept(svm, INTERCEPT_INVPCID);
+ }
+}
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
svm->vcpu.arch.hflags = 0;
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR3_READ);
- set_cr_intercept(svm, INTERCEPT_CR4_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR3_READ);
+ svm_set_intercept(svm, INTERCEPT_CR4_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
if (!kvm_vcpu_apicv_active(&svm->vcpu))
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
svm_clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
- clr_cr_intercept(svm, INTERCEPT_CR3_READ);
- clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = svm->vcpu.arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
svm->asid_generation = 0;
- svm->nested.vmcb = 0;
+ svm->nested.vmcb12_gpa = 0;
svm->vcpu.arch.hflags = 0;
if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
+ svm_check_invpcid(svm);
+
if (kvm_vcpu_apicv_active(&svm->vcpu))
avic_init_vmcb(svm);
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *page;
- struct page *msrpm_pages;
+ struct page *vmcb_page;
struct page *hsave_page;
- struct page *nested_msrpm_pages;
int err;
BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
svm = to_svm(vcpu);
err = -ENOMEM;
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
+ vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb_page)
goto out;
- msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
- if (!msrpm_pages)
- goto free_page1;
-
- nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
- if (!nested_msrpm_pages)
- goto free_page2;
-
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!hsave_page)
- goto free_page3;
+ goto error_free_vmcb_page;
err = avic_init_vcpu(svm);
if (err)
- goto free_page4;
+ goto error_free_hsave_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
svm->avic_is_running = true;
svm->nested.hsave = page_address(hsave_page);
- clear_page(svm->nested.hsave);
- svm->msrpm = page_address(msrpm_pages);
- svm_vcpu_init_msrpm(svm->msrpm);
+ svm->msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->msrpm)
+ goto error_free_hsave_page;
- svm->nested.msrpm = page_address(nested_msrpm_pages);
- svm_vcpu_init_msrpm(svm->nested.msrpm);
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
- svm->vmcb = page_address(page);
- clear_page(svm->vmcb);
- svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+ svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->nested.msrpm)
+ goto error_free_msrpm;
+
+ /* Initialize the nested MSR permission map with the L1 pass-through state */
+ svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm);
+
+ svm->vmcb = page_address(vmcb_page);
+ svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
svm->asid_generation = 0;
init_vmcb(svm);
return 0;
-free_page4:
+error_free_msrpm:
+ svm_vcpu_free_msrpm(svm->msrpm);
+error_free_hsave_page:
__free_page(hsave_page);
-free_page3:
- __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page1:
- __free_page(page);
+error_free_vmcb_page:
+ __free_page(vmcb_page);
out:
return err;
}
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0) {
- clr_cr_intercept(svm, INTERCEPT_CR0_READ);
- clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
}
}
{
unsigned long cr0 = svm->vcpu.arch.cr0;
bool ret = false;
- u64 intercept;
-
- intercept = svm->nested.ctl.intercept;
if (!is_guest_mode(&svm->vcpu) ||
- (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
cr0 &= ~SVM_CR0_SELECTIVE_MASK;
if (cr >= 16) { /* mov to cr */
cr -= 16;
val = kvm_register_read(&svm->vcpu, reg);
+ trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
if (!check_selective_cr0_intercepted(svm, val))
return 1;
}
kvm_register_write(&svm->vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
}
return kvm_complete_insn_gp(&svm->vcpu, err);
}
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
- set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
svm->vmcb->save.dbgctl = data;
vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
- svm_enable_lbrv(svm);
+ svm_enable_lbrv(vcpu);
else
- svm_disable_lbrv(svm);
+ svm_disable_lbrv(vcpu);
break;
case MSR_VM_HSAVE_PA:
svm->nested.hsave_msr = data;
return nop_interception(svm);
}
+static int invpcid_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ unsigned long type;
+ gva_t gva;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * For an INVPCID intercept:
+ * EXITINFO1 provides the linear address of the memory operand.
+ * EXITINFO2 provides the contents of the register operand.
+ */
+ type = svm->vmcb->control.exit_info_2;
+ gva = svm->vmcb->control.exit_info_1;
+
+ if (type > 3) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
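
The type check above relies on there being exactly four architecturally defined INVPCID types; anything above 3 is reserved and must fault. A reference sketch of those types (the enum and its names are illustrative only; the actual validation and flushing happen in the common kvm_handle_invpcid() helper):

/* Illustrative only: the four architectural INVPCID types. */
enum example_invpcid_type {
	EXAMPLE_INVPCID_INDIVIDUAL_ADDR = 0, /* flush one linear address for one PCID */
	EXAMPLE_INVPCID_SINGLE_CONTEXT  = 1, /* flush all mappings for one PCID */
	EXAMPLE_INVPCID_ALL_INCL_GLOBAL = 2, /* flush all PCIDs, including global pages */
	EXAMPLE_INVPCID_ALL_NON_GLOBAL  = 3, /* flush all PCIDs, except global pages */
};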
+
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
}
pr_err("VMCB Control Area:\n");
- pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
- pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
- pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
- pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
- pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
- pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+ pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
+ pr_err("%-20s%08x %08x\n", "intercepts:",
+ control->intercepts[INTERCEPT_WORD3],
+ control->intercepts[INTERCEPT_WORD4]);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
pr_err("%-20s%d\n", "pause filter threshold:",
control->pause_filter_thresh);
"excp_to:", save->last_excp_to);
}
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
*info1 = control->exit_info_1;
*info2 = control->exit_info_2;
+ *intr_info = control->exit_int_info;
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->exit_int_info_err;
+ else
+ *error_code = 0;
}
static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- svm_complete_interrupts(svm);
-
if (is_guest_mode(vcpu)) {
int vmexit;
- trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
- svm->vmcb->control.exit_info_1,
- svm->vmcb->control.exit_info_2,
- svm->vmcb->control.exit_int_info,
- svm->vmcb->control.exit_int_info_err,
- KVM_ISA_SVM);
+ trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
vmexit = nested_svm_exit_special(svm);
if (nested_svm_virtualize_tpr(vcpu))
return;
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
if (irr == -1)
return;
if (tpr >= irr)
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
if (nested_svm_virtualize_tpr(vcpu))
return;
- if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- if (!is_guest_mode(vcpu) &&
- to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
- fastpath_t exit_fastpath;
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
clgi();
kvm_load_guest_xsave_state(vcpu);
- if (lapic_in_kernel(vcpu) &&
- vcpu->arch.apic->lapic_timer.timer_advance_ns)
- kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
stgi();
/* Any pending NMI will happen here */
- exit_fastpath = svm_exit_handlers_fastpath(vcpu);
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_after_interrupt(&svm->vcpu);
}
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb_mark_all_clean(svm->vmcb);
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
- vmcb_mark_all_clean(svm->vmcb);
- return exit_fastpath;
+ svm_complete_interrupts(svm);
+
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ return svm_exit_handlers_fastpath(vcpu);
}
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+ /* Check again whether INVPCID interception is required */
+ svm_check_invpcid(svm);
+
if (!kvm_vcpu_apicv_active(vcpu))
return;
break;
case SVM_EXIT_WRITE_CR0: {
unsigned long cr0, val;
- u64 intercept;
if (info->intercept == x86_intercept_cr_write)
icpt_info.exit_code += info->modrm_reg;
info->intercept == x86_intercept_clts)
break;
- intercept = svm->nested.ctl.intercept;
-
- if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl,
+ INTERCEPT_SELECTIVE_CR0)))
break;
cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
/* FED8h - SVM Guest */
put_smstate(u64, smstate, 0x7ed8, 1);
/* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *nested_vmcb;
struct kvm_host_map map;
- u64 guest;
- u64 vmcb;
int ret = 0;
- guest = GET_SMSTATE(u64, smstate, 0x7ed8);
- vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+ u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+ u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (guest) {
- if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
- return 1;
- nested_vmcb = map.hva;
- ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ if (guest) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
+
+ if (!(saved_efer & EFER_SVME))
+ return 1;
+
+ if (kvm_vcpu_map(&svm->vcpu,
+ gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ return 1;
+
+ ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ }
}
return ret;
}
}
-static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
- unsigned long cr4 = kvm_read_cr4(vcpu);
- bool smep = cr4 & X86_CR4_SMEP;
- bool smap = cr4 & X86_CR4_SMAP;
- bool is_user = svm_get_cpl(vcpu) == 3;
-
- /*
- * If RIP is invalid, go ahead with emulation which will cause an
- * internal error exit.
- */
- if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
- return true;
+ bool smep, smap, is_user;
+ unsigned long cr4;
/*
* Detect and work around Errata 1096 Fam_17h_00_0Fh.
* instruction pointer so we will not be able to work around it. Let's
* print the error and request to kill the guest.
*/
+ if (likely(!insn || insn_len))
+ return true;
+
+ /*
+ * If RIP is invalid, go ahead with emulation which will cause an
+ * internal error exit.
+ */
+ if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
+ return true;
+
+ cr4 = kvm_read_cr4(vcpu);
+ smep = cr4 & X86_CR4_SMEP;
+ smap = cr4 & X86_CR4_SMAP;
+ is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
if (!sev_guest(vcpu->kvm))
return true;
* if an INIT signal is pending.
*/
return !gif_set(svm) ||
- (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
+ (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
}
static void svm_vm_destroy(struct kvm *kvm)
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
- .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
+ .can_emulate_instruction = svm_can_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+ .msr_filter_changed = svm_msr_filter_changed,
};
static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ __unused_size_checks();
+
return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
__alignof__(struct vcpu_svm), THIS_MODULE);
}
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+#define MAX_DIRECT_ACCESS_MSRS 15
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
struct vmcb *hsave;
u64 hsave_msr;
u64 vm_cr_msr;
- u64 vmcb;
- u32 host_intercept_exceptions;
+ u64 vmcb12_gpa;
/* These are the merged vectors */
u32 *msrpm;
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+ struct {
+ DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
+ DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
+ } shadow_msr_intercept;
};
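
shadow_msr_intercept records which direct-access MSRs L1 wants passed through, so that the pass-through decision can be replayed whenever the userspace MSR filter changes (the svm_msr_filter_changed hook registered earlier). A minimal sketch of that replay, assuming a direct_access_msrs[] table indexed the same way as the bitmaps; the helper signature matches the set_msr_interception() calls used earlier in this series, but the body is illustrative rather than the exact in-tree code:

static void example_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 i;

	/*
	 * Re-apply the recorded pass-through state for every direct-access
	 * MSR.  set_msr_interception() consults the userspace filter again,
	 * so MSRs the filter now denies fall back to interception.
	 */
	for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
		u32 msr = direct_access_msrs[i].index;
		int read = test_bit(i, svm->shadow_msr_intercept.read);
		int write = test_bit(i, svm->shadow_msr_intercept.write);

		set_msr_interception(vcpu, svm->msrpm, msr, read, write);
	}
}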
struct svm_cpu_data {
return svm->vmcb;
}
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr |= (1U << bit);
-
- recalc_intercepts(svm);
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __set_bit(bit, (unsigned long *)&control->intercepts);
}
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr &= ~(1U << bit);
-
- recalc_intercepts(svm);
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __clear_bit(bit, (unsigned long *)&control->intercepts);
}
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- return vmcb->control.intercept_cr & (1U << bit);
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
}
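
With the unified intercepts[] array, the four separate VMCB control fields collapse into consecutive 32-bit words addressed by a single bit number. A small illustrative sketch (not part of the patch) using the existing intercept definitions:

static void example_set_intercepts(struct vmcb_control_area *c)
{
	/* Word 0: CR read/write intercepts (was control->intercept_cr). */
	vmcb_set_intercept(c, INTERCEPT_CR0_READ);
	/* Word 1: DR read/write intercepts (was control->intercept_dr). */
	vmcb_set_intercept(c, INTERCEPT_DR7_WRITE);
	/* Word 2: exception intercepts (was control->intercept_exceptions). */
	vmcb_set_intercept(c, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR);
	/* Words 3 and up: instruction intercepts (was the 64-bit control->intercept). */
	vmcb_set_intercept(c, INTERCEPT_INVPCID);
}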
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
- | (1 << INTERCEPT_DR1_READ)
- | (1 << INTERCEPT_DR2_READ)
- | (1 << INTERCEPT_DR3_READ)
- | (1 << INTERCEPT_DR4_READ)
- | (1 << INTERCEPT_DR5_READ)
- | (1 << INTERCEPT_DR6_READ)
- | (1 << INTERCEPT_DR7_READ)
- | (1 << INTERCEPT_DR0_WRITE)
- | (1 << INTERCEPT_DR1_WRITE)
- | (1 << INTERCEPT_DR2_WRITE)
- | (1 << INTERCEPT_DR3_WRITE)
- | (1 << INTERCEPT_DR4_WRITE)
- | (1 << INTERCEPT_DR5_WRITE)
- | (1 << INTERCEPT_DR6_WRITE)
- | (1 << INTERCEPT_DR7_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
}
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_dr = 0;
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
recalc_intercepts(svm);
}
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_exceptions |= (1U << bit);
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
recalc_intercepts(svm);
}
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_exceptions &= ~(1U << bit);
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
recalc_intercepts(svm);
}
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept |= (1ULL << bit);
+ vmcb_set_intercept(&vmcb->control, bit);
recalc_intercepts(svm);
}
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept &= ~(1ULL << bit);
+ vmcb_clr_intercept(&vmcb->control, bit);
recalc_intercepts(svm);
}
static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
{
- return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
+ return vmcb_is_intercept(&svm->vmcb->control, bit);
}
static inline bool vgif_enabled(struct vcpu_svm *svm)
/* svm.c */
#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U
#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U
-#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U
+#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U
#define MSR_INVALID 0xffffffffU
u32 svm_msrpm_offset(u32 msr);
static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}
static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}
static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(unsigned int vcpu_id),
- TP_ARGS(vcpu_id),
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
+ __field( unsigned long, rip )
),
TP_fast_assign(
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->rip = kvm_rip_read(vcpu);
),
- TP_printk("vcpu %u", __entry->vcpu_id)
+ TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
);
/*
(isa == KVM_ISA_VMX) ? \
__print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+#define TRACE_EVENT_KVM_EXIT(name) \
+TRACE_EVENT(name, \
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(exit_reason, vcpu, isa), \
+ \
+ TP_STRUCT__entry( \
+ __field( unsigned int, exit_reason ) \
+ __field( unsigned long, guest_rip ) \
+ __field( u32, isa ) \
+ __field( u64, info1 ) \
+ __field( u64, info2 ) \
+ __field( u32, intr_info ) \
+ __field( u32, error_code ) \
+ __field( unsigned int, vcpu_id ) \
+ ), \
+ \
+ TP_fast_assign( \
+ __entry->exit_reason = exit_reason; \
+ __entry->guest_rip = kvm_rip_read(vcpu); \
+ __entry->isa = isa; \
+ __entry->vcpu_id = vcpu->vcpu_id; \
+ kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
+ ), \
+ \
+ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
+ "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \
+ __entry->vcpu_id, \
+ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
+ __entry->guest_rip, __entry->info1, __entry->info2, \
+ __entry->intr_info, __entry->error_code) \
+)
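
Both kvm_exit and kvm_nested_vmexit now share this template, so the two events emit identically shaped records. A hypothetical trace line (all values invented for illustration) built from the TP_printk format above:

  kvm_exit: vcpu 0 reason MSR_WRITE rip 0xffffffff81001234 info1 0x0000000000000001 info2 0x0000000000000000 intr_info 0x00000000 error_code 0x00000000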
+
/*
* Tracepoint for kvm guest exit:
*/
-TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
- TP_ARGS(exit_reason, vcpu, isa),
-
- TP_STRUCT__entry(
- __field( unsigned int, exit_reason )
- __field( unsigned long, guest_rip )
- __field( u32, isa )
- __field( u64, info1 )
- __field( u64, info2 )
- __field( unsigned int, vcpu_id )
- ),
-
- TP_fast_assign(
- __entry->exit_reason = exit_reason;
- __entry->guest_rip = kvm_rip_read(vcpu);
- __entry->isa = isa;
- __entry->vcpu_id = vcpu->vcpu_id;
- kvm_x86_ops.get_exit_info(vcpu, &__entry->info1,
- &__entry->info2);
- ),
-
- TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx",
- __entry->vcpu_id,
- kvm_print_exit_reason(__entry->exit_reason, __entry->isa),
- __entry->guest_rip, __entry->info1, __entry->info2)
-);
+TRACE_EVENT_KVM_EXIT(kvm_exit);
/*
* Tracepoint for kvm interrupt injection:
);
TRACE_EVENT(kvm_nested_intercepts,
- TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
- TP_ARGS(cr_read, cr_write, exceptions, intercept),
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions,
+ __u32 intercept1, __u32 intercept2, __u32 intercept3),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept1,
+ intercept2, intercept3),
TP_STRUCT__entry(
__field( __u16, cr_read )
__field( __u16, cr_write )
__field( __u32, exceptions )
- __field( __u64, intercept )
+ __field( __u32, intercept1 )
+ __field( __u32, intercept2 )
+ __field( __u32, intercept3 )
),
TP_fast_assign(
__entry->cr_read = cr_read;
__entry->cr_write = cr_write;
__entry->exceptions = exceptions;
- __entry->intercept = intercept;
+ __entry->intercept1 = intercept1;
+ __entry->intercept2 = intercept2;
+ __entry->intercept3 = intercept3;
),
- TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
- __entry->cr_read, __entry->cr_write, __entry->exceptions,
- __entry->intercept)
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x "
+ "intercepts: %08x %08x %08x",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept1, __entry->intercept2, __entry->intercept3)
);
/*
* Tracepoint for #VMEXIT while nested
*/
-TRACE_EVENT(kvm_nested_vmexit,
- TP_PROTO(__u64 rip, __u32 exit_code,
- __u64 exit_info1, __u64 exit_info2,
- __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
- TP_ARGS(rip, exit_code, exit_info1, exit_info2,
- exit_int_info, exit_int_info_err, isa),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, exit_code )
- __field( __u64, exit_info1 )
- __field( __u64, exit_info2 )
- __field( __u32, exit_int_info )
- __field( __u32, exit_int_info_err )
- __field( __u32, isa )
- ),
-
- TP_fast_assign(
- __entry->rip = rip;
- __entry->exit_code = exit_code;
- __entry->exit_info1 = exit_info1;
- __entry->exit_info2 = exit_info2;
- __entry->exit_int_info = exit_int_info;
- __entry->exit_int_info_err = exit_int_info_err;
- __entry->isa = isa;
- ),
- TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
- __entry->rip,
- kvm_print_exit_reason(__entry->exit_code, __entry->isa),
- __entry->exit_info1, __entry->exit_info2,
- __entry->exit_int_info, __entry->exit_int_info_err)
-);
+TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
/*
* Tracepoint for #VMEXIT reinjected to the guest
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDTSCP;
+ SECONDARY_EXEC_ENABLE_RDTSCP;
}
static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vmx_rdrand_supported(void)
+static inline bool cpu_has_vmx_rdrand(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDRAND_EXITING;
SECONDARY_EXEC_ENCLS_EXITING;
}
-static inline bool vmx_rdseed_supported(void)
+static inline bool cpu_has_vmx_rdseed(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDSEED_EXITING;
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
-static inline bool vmx_xsaves_supported(void)
+static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_XSAVES;
}
-static inline bool vmx_waitpkg_supported(void)
+static inline bool cpu_has_vmx_waitpkg(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
vmx->nested.hv_evmcs = NULL;
}
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+ struct loaded_vmcs *prev)
+{
+ struct vmcs_host_state *dest, *src;
+
+ if (unlikely(!vmx->guest_state_loaded))
+ return;
+
+ src = &prev->host_state;
+ dest = &vmx->loaded_vmcs->host_state;
+
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+ dest->ds_sel = src->ds_sel;
+ dest->es_sel = src->es_sel;
+#endif
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *prev;
+ int cpu;
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
+ return;
+
+ cpu = get_cpu();
+ prev = vmx->loaded_vmcs;
+ vmx->loaded_vmcs = vmcs;
+ vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+ vmx_sync_vmcs_host_state(vmx, prev);
+ put_cpu();
+
+ vmx_register_cache_reset(vcpu);
+}
+
/*
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
* just stops using VMX.
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
- kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
free_loaded_vmcs(&vmx->nested.vmcs02);
}
-static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
- struct loaded_vmcs *prev)
-{
- struct vmcs_host_state *dest, *src;
-
- if (unlikely(!vmx->guest_state_loaded))
- return;
-
- src = &prev->host_state;
- dest = &vmx->loaded_vmcs->host_state;
-
- vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
- dest->ldt_sel = src->ldt_sel;
-#ifdef CONFIG_X86_64
- dest->ds_sel = src->ds_sel;
- dest->es_sel = src->es_sel;
-#endif
-}
-
-static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct loaded_vmcs *prev;
- int cpu;
-
- if (vmx->loaded_vmcs == vmcs)
- return;
-
- cpu = get_cpu();
- prev = vmx->loaded_vmcs;
- vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu, prev);
- vmx_sync_vmcs_host_state(vmx, prev);
- put_cpu();
-
- vmx_register_cache_reset(vcpu);
-}
-
/*
* Ensure that the current vmcs of the logical processor is the
* vmcs01 of the vcpu before calling free_nested().
{
vcpu_load(vcpu);
vmx_leave_nested(vcpu);
- vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
- free_nested(vcpu);
vcpu_put(vcpu);
}
* VM-exit in L0, use the more accurate value.
*/
if (msr_index == MSR_IA32_TSC) {
- int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
- MSR_IA32_TSC);
+ int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
- if (index >= 0) {
- u64 val = vmx->msr_autostore.guest.val[index].value;
+ if (i >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[i].value;
*data = kvm_read_l1_tsc(vcpu, val);
return true;
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
bool in_vmcs12_store_list;
- int msr_autostore_index;
+ int msr_autostore_slot;
bool in_autostore_list;
int last;
- msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
- in_autostore_list = msr_autostore_index >= 0;
+ msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
+ in_autostore_list = msr_autostore_slot >= 0;
in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
if (in_vmcs12_store_list && !in_autostore_list) {
- if (autostore->nr == NR_LOADSTORE_MSRS) {
+ if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
autostore->val[last].index = msr_index;
} else if (!in_vmcs12_store_list && in_autostore_list) {
last = --autostore->nr;
- autostore->val[msr_autostore_index] = autostore->val[last];
+ autostore->val[msr_autostore_slot] = autostore->val[last];
}
}
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_ENABLE_INVPCID |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
vmcs_write16(GUEST_INTR_STATUS,
vmcs12->guest_intr_status);
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
secondary_exec_controls_set(vmx, exec_control);
}
vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmx->segment_cache.bitmask = 0;
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
* which means L1 attempted VMEntry to L2 with invalid state.
* Fail the VMEntry.
*/
- if (vmx->emulation_required) {
+ if (CC(!vmx_guest_state_valid(vcpu))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
prepare_vmcs02_early(vmx, vmcs12);
if (from_vmentry) {
- if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+ if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ }
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
* to nested_get_vmcs12_pages before the next VM-entry. The MSRs
* have already been set at vmentry time and should not be reset.
*/
- kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
/*
if (evmptrld_status == EVMPTRLD_ERROR) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
- } else if (evmptrld_status == EVMPTRLD_VMFAIL) {
+ } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
return nested_vmx_failInvalid(vcpu);
}
- if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+ if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
* rather than RFLAGS.ZF, and no error number is stored to the
* VM-instruction error field.
*/
- if (vmcs12->hdr.shadow_vmcs)
+ if (CC(vmcs12->hdr.shadow_vmcs))
return nested_vmx_failInvalid(vcpu);
if (vmx->nested.hv_evmcs) {
* for misconfigurations which will anyway be caught by the processor
* when using the merged vmcs02.
*/
- if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
- if (vmcs12->launch_state == launch)
+ if (CC(vmcs12->launch_state == launch))
return nested_vmx_fail(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
goto vmentry_failed;
+ /* Emulate processing of posted interrupts on VM-Enter. */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
+ }
+
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{
- struct shared_msr_entry *efer_msr;
+ struct vmx_uret_msr *efer_msr;
unsigned int i;
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
return vmx->msr_autoload.guest.val[i].value;
}
- efer_msr = find_msr_entry(vmx, MSR_EFER);
+ efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
if (efer_msr)
return efer_msr->data;
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
+ /*
+ * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
+ * up-to-date before switching to L1.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ vmx_ept_load_pdptrs(vcpu);
+
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
vmx->nested.msrs.entry_ctls_high &=
~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high &=
- ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
}
}
r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
if (r != X86EMUL_CONTINUE) {
- *ret = vmx_handle_memory_failure(vcpu, r, &e);
+ *ret = kvm_handle_memory_failure(vcpu, r, &e);
return -EINVAL;
}
if (vmx_pt_mode_is_host_guest()) {
vmx->pt_desc.guest.ctl = 0;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
}
return 0;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
}
return nested_vmx_succeed(vcpu);
return 1;
r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
}
field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr,
sizeof(gpa_t), &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
return nested_vmx_succeed(vcpu);
}
return 1;
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
/*
* Nested EPT roots are always held through guest_mmu,
return 1;
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
if (operand.vpid >> 16)
return nested_vmx_fail(vcpu,
goto reflect_vmexit;
}
- exit_intr_info = vmx_get_intr_info(vcpu);
- exit_qual = vmx_get_exit_qual(vcpu);
-
- trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
- vmx->idt_vectoring_info, exit_intr_info,
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
- KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
* need to be synthesized by querying the in-kernel LAPIC, but external
* interrupts are never reflected to L1 so it's a non-issue.
*/
- if ((exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ exit_intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(exit_intr_info)) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
vmcs12->vm_exit_intr_error_code =
vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
}
+ exit_qual = vmx_get_exit_qual(vcpu);
reflect_vmexit:
nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
* restored yet. EVMCS will be mapped from
* nested_get_vmcs12_pages().
*/
- kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
} else {
return -EINVAL;
}
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
- VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
- VM_ENTRY_LOAD_IA32_PAT;
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
- .get_vmcs12_pages = nested_get_vmcs12_pages,
+ .get_nested_state_pages = nested_get_vmcs12_pages,
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_INSN_H
-#define __KVM_X86_VMX_INSN_H
-
-#include <linux/nospec.h>
-
-#include <asm/kvm_host.h>
-#include <asm/vmx.h>
-
-#include "evmcs.h"
-#include "vmcs.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
-asmlinkage void vmread_error(unsigned long field, bool fault);
-__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
- bool fault);
-void vmwrite_error(unsigned long field, unsigned long value);
-void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
-void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
-void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
-void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
-
-static __always_inline void vmcs_check16(unsigned long field)
-{
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
- "16-bit accessor invalid for 64-bit field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
- "16-bit accessor invalid for 64-bit high field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "16-bit accessor invalid for 32-bit high field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
- "16-bit accessor invalid for natural width field");
-}
-
-static __always_inline void vmcs_check32(unsigned long field)
-{
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
- "32-bit accessor invalid for 16-bit field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
- "32-bit accessor invalid for natural width field");
-}
-
-static __always_inline void vmcs_check64(unsigned long field)
-{
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
- "64-bit accessor invalid for 16-bit field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
- "64-bit accessor invalid for 64-bit high field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "64-bit accessor invalid for 32-bit field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
- "64-bit accessor invalid for natural width field");
-}
-
-static __always_inline void vmcs_checkl(unsigned long field)
-{
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
- "Natural width accessor invalid for 16-bit field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
- "Natural width accessor invalid for 64-bit field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
- "Natural width accessor invalid for 64-bit high field");
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "Natural width accessor invalid for 32-bit field");
-}
-
-static __always_inline unsigned long __vmcs_readl(unsigned long field)
-{
- unsigned long value;
-
- asm volatile("1: vmread %2, %1\n\t"
- ".byte 0x3e\n\t" /* branch taken hint */
- "ja 3f\n\t"
-
- /*
- * VMREAD failed. Push '0' for @fault, push the failing
- * @field, and bounce through the trampoline to preserve
- * volatile registers.
- */
- "push $0\n\t"
- "push %2\n\t"
- "2:call vmread_error_trampoline\n\t"
-
- /*
- * Unwind the stack. Note, the trampoline zeros out the
- * memory for @fault so that the result is '0' on error.
- */
- "pop %2\n\t"
- "pop %1\n\t"
- "3:\n\t"
-
- /* VMREAD faulted. As above, except push '1' for @fault. */
- ".pushsection .fixup, \"ax\"\n\t"
- "4: push $1\n\t"
- "push %2\n\t"
- "jmp 2b\n\t"
- ".popsection\n\t"
- _ASM_EXTABLE(1b, 4b)
- : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
- return value;
-}
-
-static __always_inline u16 vmcs_read16(unsigned long field)
-{
- vmcs_check16(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_read16(field);
- return __vmcs_readl(field);
-}
-
-static __always_inline u32 vmcs_read32(unsigned long field)
-{
- vmcs_check32(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_read32(field);
- return __vmcs_readl(field);
-}
-
-static __always_inline u64 vmcs_read64(unsigned long field)
-{
- vmcs_check64(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_read64(field);
-#ifdef CONFIG_X86_64
- return __vmcs_readl(field);
-#else
- return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
-#endif
-}
-
-static __always_inline unsigned long vmcs_readl(unsigned long field)
-{
- vmcs_checkl(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_read64(field);
- return __vmcs_readl(field);
-}
-
-#define vmx_asm1(insn, op1, error_args...) \
-do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
- ".byte 0x2e\n\t" /* branch not taken hint */ \
- "jna %l[error]\n\t" \
- _ASM_EXTABLE(1b, %l[fault]) \
- : : op1 : "cc" : error, fault); \
- return; \
-error: \
- instrumentation_begin(); \
- insn##_error(error_args); \
- instrumentation_end(); \
- return; \
-fault: \
- kvm_spurious_fault(); \
-} while (0)
-
-#define vmx_asm2(insn, op1, op2, error_args...) \
-do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
- ".byte 0x2e\n\t" /* branch not taken hint */ \
- "jna %l[error]\n\t" \
- _ASM_EXTABLE(1b, %l[fault]) \
- : : op1, op2 : "cc" : error, fault); \
- return; \
-error: \
- instrumentation_begin(); \
- insn##_error(error_args); \
- instrumentation_end(); \
- return; \
-fault: \
- kvm_spurious_fault(); \
-} while (0)
-
-static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
-{
- vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
-}
-
-static __always_inline void vmcs_write16(unsigned long field, u16 value)
-{
- vmcs_check16(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_write16(field, value);
-
- __vmcs_writel(field, value);
-}
-
-static __always_inline void vmcs_write32(unsigned long field, u32 value)
-{
- vmcs_check32(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_write32(field, value);
-
- __vmcs_writel(field, value);
-}
-
-static __always_inline void vmcs_write64(unsigned long field, u64 value)
-{
- vmcs_check64(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_write64(field, value);
-
- __vmcs_writel(field, value);
-#ifndef CONFIG_X86_64
- __vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
-{
- vmcs_checkl(field);
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_write64(field, value);
-
- __vmcs_writel(field, value);
-}
-
-static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
-{
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
- "vmcs_clear_bits does not support 64-bit fields");
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_write32(field, evmcs_read32(field) & ~mask);
-
- __vmcs_writel(field, __vmcs_readl(field) & ~mask);
-}
-
-static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
-{
- BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
- "vmcs_set_bits does not support 64-bit fields");
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_write32(field, evmcs_read32(field) | mask);
-
- __vmcs_writel(field, __vmcs_readl(field) | mask);
-}
-
-static inline void vmcs_clear(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
-
- vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
-}
-
-static inline void vmcs_load(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
-
- if (static_branch_unlikely(&enable_evmcs))
- return evmcs_load(phys_addr);
-
- vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
-}
-
-static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
-{
- struct {
- u64 vpid : 16;
- u64 rsvd : 48;
- u64 gva;
- } operand = { vpid, 0, gva };
-
- vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
-}
-
-static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
-{
- struct {
- u64 eptp, gpa;
- } operand = {eptp, gpa};
-
- vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
-}
-
-static inline void vpid_sync_vcpu_single(int vpid)
-{
- if (vpid == 0)
- return;
-
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
-}
-
-static inline void vpid_sync_vcpu_global(void)
-{
- __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
-}
-
-static inline void vpid_sync_context(int vpid)
-{
- if (cpu_has_vmx_invvpid_single())
- vpid_sync_vcpu_single(vpid);
- else if (vpid != 0)
- vpid_sync_vcpu_global();
-}
-
-static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
-{
- if (vpid == 0)
- return;
-
- if (cpu_has_vmx_invvpid_individual_addr())
- __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
- else
- vpid_sync_context(vpid);
-}
-
-static inline void ept_sync_global(void)
-{
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
-}
-
-static inline void ept_sync_context(u64 eptp)
-{
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
-}
-
-#endif /* __KVM_X86_VMX_INSN_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/cpu.h>
+
+#include "lapic.h"
+#include "posted_intr.h"
+#include "trace.h"
+#include "vmx.h"
+
+/*
+ * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
+ * can find which vCPU should be woken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
+static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ return;
+
+ /*
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block is the one expected to change PI.NDST and the
+ * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+ * matches PI.NDST. Otherwise, a vCPU may not be able to be woken up
+ * correctly.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ goto after_clear_sn;
+ }
+
+ /* The full case. */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ new.sn = 0;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+after_clear_sn:
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it doesn't really have a memory barrier that
+ * pairs with this, but we cannot do that and we need one.
+ */
+ smp_mb__after_atomic();
+
+ if (!pi_is_pir_empty(pi_desc))
+ pi_set_on(pi_desc);
+}
+
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
+}
+
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ do {
+ old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * This routine does the following things for a vCPU that is about to
+ * block, if VT-d PI is enabled:
+ * - Store the vCPU on the wakeup list, so that when an interrupt
+ * arrives we can find the right vCPU to wake up.
+ * - Change the posted-interrupt descriptor as follows:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, meaning at least one interrupt
+ * has been posted for this vCPU, we cannot block it; in that case
+ * return 1, otherwise return 0.
+ */
+int pi_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return 0;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since the vCPU can be preempted during this process,
+ * vcpu->cpu may differ from pre_pcpu. Set pre_pcpu as the
+ * destination of the wakeup notification event so that the
+ * wakeup handler can find the right vCPU to wake up if an
+ * interrupt arrives while the vCPU is blocked.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
+}
+
+void pi_post_block(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->pre_pcpu == -1)
+ return;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ __pi_post_block(vcpu);
+ local_irq_enable();
+}
+
+/*
+ * Handler for POSTED_INTR_WAKEUP_VECTOR.
+ */
+void pi_wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+void __init pi_init(int cpu)
+{
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
+}
+
+/*
+ * pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+ bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot post multicast/broadcast interrupts to a
+ * vCPU, so we still use interrupt remapping for those kinds
+ * of interrupts.
+ *
+ * For lowest-priority interrupts, we only support those with
+ * a single CPU as the destination, e.g. the user configures
+ * the interrupts via /proc/irq or uses irqbalance to make
+ * them single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ *
+ * In addition, we can only inject generic interrupts using
+ * the PI mechanism, refuse to route others through it.
+ */
+
+ kvm_set_msi_irq(kvm, e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
+ continue;
+ }
+
+ vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_POSTED_INTR_H
+#define __KVM_X86_VMX_POSTED_INTR_H
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
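
The accessors above are enough to express the sender side of posted-interrupt delivery. A condensed sketch, loosely following what the VMX delivery path does (illustration only; the real code also sends the notification IPI and handles the vCPU-kick fallback):

static bool example_post_interrupt(struct pi_desc *pi_desc, int vector)
{
	/* Record the vector in the Posted-Interrupt Request bitmap. */
	if (pi_test_and_set_pir(vector, pi_desc))
		return false;	/* vector was already pending */

	/* Raise Outstanding Notification; if it was already set, an IPI is in flight. */
	if (pi_test_and_set_on(pi_desc))
		return false;

	return true;		/* caller must send the notification IPI */
}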
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
+int pi_pre_block(struct kvm_vcpu *vcpu);
+void pi_post_block(struct kvm_vcpu *vcpu);
+void pi_wakeup_handler(void);
+void __init pi_init(int cpu);
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+ bool set);
+
+#endif /* __KVM_X86_VMX_POSTED_INTR_H */
\ No newline at end of file
return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
}
+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+ return (intr_info & mask) == mask;
+}
+
enum vmcs_field_width {
VMCS_FIELD_WIDTH_U16 = 0,
VMCS_FIELD_WIDTH_U64 = 1,
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
+#include <asm/segment.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
ret
SYM_FUNC_END(vmread_error_trampoline)
+
+SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ CALL_NOSPEC _ASM_ARG1
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ ret
+SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
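
For readers following the assembly above: on 64-bit kernels the routine hands
the IDT handler a synthetic interrupt frame, and the handler's IRET consumes
exactly five words. The struct below is purely illustrative (it does not exist
in the patch); it spells the frame out from low to high address:

::

  /* Memory layout of the synthetic 64-bit interrupt frame built above. */
  struct synthetic_irq_frame {
          unsigned long rip;     /* pushed implicitly by CALL_NOSPEC      */
          unsigned long cs;      /* push $__KERNEL_CS                     */
          unsigned long rflags;  /* pushf                                 */
          unsigned long rsp;     /* push %rbp (original RSP saved in RBP) */
          unsigned long ss;      /* push $__KERNEL_DS                     */
  };

  _Static_assert(sizeof(struct synthetic_irq_frame) == 5 * sizeof(unsigned long),
                 "a hardware IRET frame is five words on x86-64");
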
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
-#include "ops.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
- (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+/*
+ * List of MSRs that can be directly passed to the guest.
+ * In addition to these x2apic and PT MSRs are handled specially.
+ */
+static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_TSC,
+ MSR_FS_BASE,
+ MSR_GS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_ESP,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_CORE_C1_RES,
+ MSR_CORE_C3_RESIDENCY,
+ MSR_CORE_C6_RESIDENCY,
+ MSR_CORE_C7_RESIDENCY,
+};
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type);
void vmx_vmexit(void);
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
+ * vmx_uret_msrs_list[], even in i386 builds.
*/
-const u32 vmx_msr_index[] = {
+static const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
return flexpriority_enabled;
}
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+static int possible_passthrough_msr_slot(u32 msr)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+
+ return -ENOENT;
+}
+
+static bool is_valid_passthrough_msr(u32 msr)
+{
+ bool r;
+
+ switch (msr) {
+ case 0x800 ... 0x8ff:
+ /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
+ return true;
+ case MSR_IA32_RTIT_STATUS:
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ case MSR_IA32_RTIT_CR3_MATCH:
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ return true;
+ }
+
+ r = possible_passthrough_msr_slot(msr) != -ENOENT;
+
+ WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+
+ return r;
+}
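
A user-space model of the slot lookup above, with a shortened table. The MSR
numbers are the architectural values; passthrough_slot() is a stand-in name for
possible_passthrough_msr_slot(). The returned index matters because the same
index is reused as the bit position in the shadow_msr_intercept bitmaps added
to struct vcpu_vmx later in this series:

::

  #include <stdint.h>
  #include <stdio.h>

  #define MSR_IA32_SPEC_CTRL      0x00000048
  #define MSR_FS_BASE             0xc0000100
  #define MSR_GS_BASE             0xc0000101

  static const uint32_t passthrough_msrs[] = {
          MSR_IA32_SPEC_CTRL, MSR_FS_BASE, MSR_GS_BASE,
  };

  static int passthrough_slot(uint32_t msr)
  {
          unsigned int n = sizeof(passthrough_msrs) / sizeof(passthrough_msrs[0]);

          for (unsigned int i = 0; i < n; i++)
                  if (passthrough_msrs[i] == msr)
                          return i;
          return -1;      /* stands in for -ENOENT */
  }

  int main(void)
  {
          printf("MSR_GS_BASE -> slot %d\n", passthrough_slot(MSR_GS_BASE)); /* 2  */
          printf("0x1234      -> slot %d\n", passthrough_slot(0x1234));      /* -1 */
          return 0;
  }
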
+
+static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ for (i = 0; i < vmx->nr_uret_msrs; ++i)
+ if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
return i;
return -1;
}
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __find_msr_index(vmx, msr);
+ i = __vmx_find_uret_msr(vmx, msr);
if (i >= 0)
- return &vmx->guest_msrs[i];
+ return &vmx->guest_uret_msrs[i];
return NULL;
}
-static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+ struct vmx_uret_msr *msr, u64 data)
{
int ret = 0;
u64 old_msr_data = msr->data;
msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
preempt_disable();
- ret = kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
+ ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
vm_exit_controls_clearbit(vmx, exit);
}
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
unsigned int i;
}
break;
}
- i = vmx_find_msr_index(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
- i = vmx_find_msr_index(&m->host, msr);
+ i = vmx_find_loadstore_msr_slot(&m->host, msr);
if (i < 0)
return;
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- i = vmx_find_msr_index(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (!entry_only)
- j = vmx_find_msr_index(&m->host, msr);
+ j = vmx_find_loadstore_msr_slot(&m->host, msr);
- if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
- (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
+ if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
m->host.val[j].value = host_val;
}
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
+ int i;
/* Shadow paging assumes NX to be available. */
if (!enable_ept)
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
- } else {
- clear_atomic_switch_msr(vmx, MSR_EFER);
+ }
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ if (i < 0)
+ return false;
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+ clear_atomic_switch_msr(vmx, MSR_EFER);
- return true;
- }
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+
+ vmx->guest_uret_msrs[i].data = guest_efer;
+ vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+ return true;
}
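
The merge at the heart of update_transition_efer() reads as: bits named in
ignore_bits are left at the host EFER value (they are either handled by
VM-entry controls or meaningless to the guest), every other bit comes from the
guest EFER. A minimal sketch using the architectural EFER bit positions; the
exact ignore set computed by the function above varies with long mode:

::

  #include <stdint.h>
  #include <stdio.h>

  #define EFER_SCE (1ull << 0)
  #define EFER_LME (1ull << 8)
  #define EFER_LMA (1ull << 10)
  #define EFER_NX  (1ull << 11)

  static uint64_t merge_efer(uint64_t guest, uint64_t host, uint64_t ignore)
  {
          return (guest & ~ignore) | (host & ignore);
  }

  int main(void)
  {
          uint64_t guest  = EFER_NX;                            /* 32-bit guest */
          uint64_t host   = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
          uint64_t ignore = EFER_SCE | EFER_LME | EFER_LMA;

          /* Guest keeps its own NX; the ignored bits track the host. */
          printf("merged EFER = %#llx\n",
                 (unsigned long long)merge_efer(guest, host, ignore));
          return 0;
  }
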
#ifdef CONFIG_X86_32
!(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+ /* The base must be 128-byte aligned and a legal physical address. */
+ return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+}
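
pt_output_base_valid() replaces the old MSR_IA32_RTIT_OUTPUT_BASE_MASK test
with two explicit conditions: the output base must be 128-byte aligned and must
fit in the guest's physical-address width. A standalone sketch with maxphyaddr
passed in explicitly (the kernel reads it from the cached CPUID data):

::

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static bool output_base_valid(uint64_t base, unsigned int maxphyaddr)
  {
          bool illegal_gpa = base & ~((1ull << maxphyaddr) - 1);

          return !illegal_gpa && !(base & 0x7f);
  }

  int main(void)
  {
          printf("%d\n", output_base_valid(0x100000,   48)); /* 1: aligned, in range    */
          printf("%d\n", output_base_valid(0x100040,   48)); /* 0: not 128-byte aligned */
          printf("%d\n", output_base_valid(1ull << 52, 48)); /* 0: above maxphyaddr     */
          return 0;
  }
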
+
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
- if (!vmx->guest_msrs_ready) {
- vmx->guest_msrs_ready = true;
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
+ if (!vmx->guest_uret_msrs_loaded) {
+ vmx->guest_uret_msrs_loaded = true;
+ for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
+ kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ vmx->guest_uret_msrs[i].data,
+ vmx->guest_uret_msrs[i].mask);
}
#endif
load_fixmap_gdt(raw_smp_processor_id());
vmx->guest_state_loaded = false;
- vmx->guest_msrs_ready = false;
+ vmx->guest_uret_msrs_loaded = false;
}
#ifdef CONFIG_X86_64
}
#endif
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- /*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
- */
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
- return;
-
- /*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
- */
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
- }
-
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
-after_clear_sn:
-
- /*
- * Clear SN before reading the bitmap. The VT-d firmware
- * writes the bitmap and reads SN atomically (5.2.3 in the
- * spec), so it doesn't really have a memory barrier that
- * pairs with this, but we cannot do that and we need one.
- */
- smp_mb__after_atomic();
-
- if (!pi_is_pir_empty(pi_desc))
- pi_set_on(pi_desc);
-}
-
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
static bool emulation_required(struct kvm_vcpu *vcpu)
{
- return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+ return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long old_rflags;
- if (enable_unrestricted_guest) {
+ if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
vmcs_writel(GUEST_RFLAGS, rflags);
return 0;
}
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+{
+ return true;
+}
+
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip, orig_rip;
return 1;
}
-/*
- * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
- * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
- * indicates whether exit to userspace is needed.
- */
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
- struct x86_exception *e)
-{
- if (r == X86EMUL_PROPAGATE_FAULT) {
- kvm_inject_emulated_page_fault(vcpu, e);
- return 1;
- }
-
- /*
- * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
- * while handling a VMX instruction KVM could've handled the request
- * correctly by exiting to userspace and performing I/O but there
- * doesn't seem to be a real use-case behind such requests, just return
- * KVM_EXIT_INTERNAL_ERROR for now.
- */
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
-
- return 0;
-}
-
/*
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
vmx_clear_hlt(vcpu);
}
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
{
- struct shared_msr_entry tmp;
+ struct vmx_uret_msr tmp;
+ int from, to;
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
+ from = __vmx_find_uret_msr(vmx, msr);
+ if (from < 0)
+ return;
+ to = vmx->nr_active_uret_msrs++;
+
+ tmp = vmx->guest_uret_msrs[to];
+ vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
+ vmx->guest_uret_msrs[from] = tmp;
}
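
vmx_setup_uret_msr() compacts the MSRs that are actually needed to the front of
guest_uret_msrs[], so the load loop in vmx_prepare_switch_to_guest() only walks
nr_active_uret_msrs entries; setup_msrs() resets the active count to zero
before re-running the selection. A user-space sketch of the "swap into the
active prefix" idea, with simplified types and made-up names (the MSR numbers
are the architectural ones):

::

  #include <stdint.h>
  #include <stdio.h>

  struct uret_msr { uint32_t msr; uint64_t data; };

  static struct uret_msr msrs[] = {
          { 0xc0000081 },         /* MSR_STAR  */
          { 0xc0000082 },         /* MSR_LSTAR */
          { 0xc0000080 },         /* MSR_EFER  */
  };
  static int nr_active;

  static void activate(uint32_t msr)
  {
          unsigned int from, n = sizeof(msrs) / sizeof(msrs[0]);

          for (from = 0; from < n; from++)
                  if (msrs[from].msr == msr)
                          break;
          if (from == n)
                  return;                         /* not a candidate MSR */

          struct uret_msr tmp = msrs[nr_active];  /* swap into the active prefix */
          msrs[nr_active] = msrs[from];
          msrs[from] = tmp;
          nr_active++;
  }

  int main(void)
  {
          activate(0xc0000080);                   /* EFER first  */
          activate(0xc0000082);                   /* then LSTAR  */
          for (int i = 0; i < nr_active; i++)
                  printf("active[%d] = %#x\n", i, msrs[i].msr);
          return 0;
  }
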
/*
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- int save_nmsrs, index;
-
- save_nmsrs = 0;
+ vmx->guest_uret_msrs_loaded = false;
+ vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- index = __find_msr_index(vmx, MSR_STAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
+ vmx_setup_uret_msr(vmx, MSR_STAR);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
}
#endif
- index = __find_msr_index(vmx, MSR_EFER);
- if (index >= 0 && update_transition_efer(vmx, index))
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
-
- vmx->save_nmsrs = save_nmsrs;
- vmx->guest_msrs_ready = false;
+ if (update_transition_efer(vmx))
+ vmx_setup_uret_msr(vmx, MSR_EFER);
+
+ if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
u32 index;
switch (msr_info->index) {
if (!msr_info->host_initiated &&
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
case MSR_IA32_UMWAIT_CONTROL:
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
default:
- find_shared_msr:
- msr = find_msr_entry(vmx, msr_info->index);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ vmx_disable_intercept_for_msr(vcpu,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
return 1;
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
break;
case MSR_IA32_RTIT_STATUS:
if (!pt_can_write_msr(vmx))
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))
return 1;
- if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
+ if (!pt_output_base_valid(vcpu, data))
return 1;
vmx->pt_desc.guest.output_base = data;
break;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
default:
- find_shared_msr:
- msr = find_msr_entry(vmx, msr_index);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_index);
if (msr)
- ret = vmx_set_guest_msr(vmx, msr, data);
+ ret = vmx_set_guest_uret_msr(vmx, msr, data);
else
ret = kvm_set_msr_common(vcpu, msr_info);
}
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
case VCPU_EXREG_CR3:
- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+ if (is_unrestricted_guest(vcpu) ||
+ (enable_ept && is_paging(vcpu)))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
case VCPU_EXREG_CR4:
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
if (!msr)
return;
vpid_sync_context(to_vmx(vcpu)->vpid);
}
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
unsigned long hw_cr0;
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (enable_unrestricted_guest)
+ if (is_unrestricted_guest(vcpu))
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
}
#endif
- if (enable_ept && !enable_unrestricted_guest)
+ if (enable_ept && !is_unrestricted_guest(vcpu))
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
guest_cr3 = vcpu->arch.cr3;
else /* vmcs01.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
- ept_load_pdptrs(vcpu);
+ vmx_ept_load_pdptrs(vcpu);
} else {
guest_cr3 = pgd;
}
unsigned long hw_cr4;
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (enable_unrestricted_guest)
+ if (is_unrestricted_guest(vcpu))
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!enable_unrestricted_guest) {
+ if (!is_unrestricted_guest(vcpu)) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
* tree. Newer qemu binaries with that qemu fix would not need this
* kvm hack.
*/
- if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
* not.
* We assume that registers are always usable
*/
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{
- if (enable_unrestricted_guest)
- return true;
-
/* real mode guest state checks */
if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
spin_unlock(&vmx_vpid_lock);
}
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
+static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
{
int f = sizeof(unsigned long);
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
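
The four helpers above encode the architectural layout of the 4 KiB VMX MSR
bitmap: read bits for MSRs 0x0-0x1fff at offset 0x000, read bits for
0xc0000000-0xc0001fff at 0x400, and the matching write bits at 0x800 and 0xc00.
A standalone sketch (illustrative only) that computes the absolute bit number
for a given MSR and access type; MSRs outside both ranges always intercept:

::

  #include <stdint.h>
  #include <stdio.h>

  static long msr_bitmap_bit(uint32_t msr, int is_write)
  {
          long base = is_write ? 0x800 * 8 : 0;

          if (msr <= 0x1fff)
                  return base + msr;
          if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                  return base + 0x400 * 8 + (msr & 0x1fff);
          return -1;      /* outside both ranges: unconditional VM-exit */
  }

  int main(void)
  {
          printf("read  MSR_FS_BASE  (0xc0000100): bit %ld\n", msr_bitmap_bit(0xc0000100, 0));
          printf("write MSR_IA32_TSC (0x00000010): bit %ld\n", msr_bitmap_bit(0x10, 1));
          return 0;
  }
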
+
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
if (!cpu_has_vmx_msr_bitmap())
return;
evmcs_touch_msr_bitmap();
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __clear_bit(msr, msr_bitmap + 0x000 / f);
+ * Mark the desired intercept state in shadow bitmap, this is needed
+ * for resync when the MSR filters change.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
+
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
+ }
+ }
- if (type & MSR_TYPE_W)
- /* write-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f);
+ if ((type & MSR_TYPE_R) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ type &= ~MSR_TYPE_R;
+ }
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __clear_bit(msr, msr_bitmap + 0x400 / f);
+ if ((type & MSR_TYPE_W) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+ type &= ~MSR_TYPE_W;
+ }
- if (type & MSR_TYPE_W)
- /* write-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f);
+ if (type & MSR_TYPE_R)
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
- }
+ if (type & MSR_TYPE_W)
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
- int f = sizeof(unsigned long);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
if (!cpu_has_vmx_msr_bitmap())
return;
evmcs_touch_msr_bitmap();
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __set_bit(msr, msr_bitmap + 0x000 / f);
-
- if (type & MSR_TYPE_W)
- /* write-low */
- __set_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __set_bit(msr, msr_bitmap + 0x400 / f);
+ * Mark the desired intercept state in shadow bitmap, this is needed
+ * for resync when the MSR filter changes.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
+
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
+ }
+ }
- if (type & MSR_TYPE_W)
- /* write-high */
- __set_bit(msr, msr_bitmap + 0xc00 / f);
+ if (type & MSR_TYPE_R)
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
- }
+ if (type & MSR_TYPE_W)
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type, bool value)
+static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type, bool value)
{
if (value)
- vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ vmx_enable_intercept_for_msr(vcpu, msr, type);
else
- vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+ vmx_disable_intercept_for_msr(vcpu, msr, type);
}
static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
return mode;
}
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
- u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
{
int msr;
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
- msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ for (msr = 0x800; msr <= 0x8ff; msr++) {
+ bool apicv = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV);
+
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, !apicv);
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, true);
}
if (mode & MSR_BITMAP_MODE_X2APIC) {
* TPR reads and writes can be virtualized even if virtual interrupt
* delivery is not in use.
*/
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
- vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
}
}
}
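
The 0x800-0x8ff loop above covers the x2APIC MSR range; X2APIC_MSR(), defined
in vmx.h, maps an xAPIC MMIO register offset to its MSR number as
0x800 + (offset >> 4). A tiny sketch with the usual register offsets:

::

  #include <stdio.h>

  #define APIC_BASE_MSR   0x800
  #define X2APIC_MSR(r)   (APIC_BASE_MSR + ((r) >> 4))

  int main(void)
  {
          printf("TPR   (0x080) -> MSR %#x\n", X2APIC_MSR(0x080));  /* 0x808 */
          printf("EOI   (0x0b0) -> MSR %#x\n", X2APIC_MSR(0x0b0));  /* 0x80b */
          printf("TMCCT (0x390) -> MSR %#x\n", X2APIC_MSR(0x390));  /* 0x839 */
          return 0;
  }
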
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
u8 mode = vmx_msr_bitmap_mode(vcpu);
u8 changed = mode ^ vmx->msr_bitmap_mode;
return;
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+ vmx_update_msr_bitmap_x2apic(vcpu, mode);
vmx->msr_bitmap_mode = mode;
}
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
u32 i;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
- MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
for (i = 0; i < vmx->pt_desc.addr_range; i++) {
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
}
return ((rvi & 0xf0) > (vppr & 0xf0));
}
+static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all potentially passed through MSRs
+ * again. They will automatically get filtered through the MSR filter,
+ * so we are back in sync after this.
+ */
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ u32 msr = vmx_possible_passthrough_msrs[i];
+ bool read = test_bit(i, vmx->shadow_msr_intercept.read);
+ bool write = test_bit(i, vmx->shadow_msr_intercept.write);
+
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+ }
+
+ pt_update_intercept_for_msr(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
+}
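
Taken together with vmx_disable/enable_intercept_for_msr() above, the rule is:
a bit in the hardware bitmap ends up clear (pass-through) only when KVM wants
the MSR passed through and the userspace filter installed via
KVM_X86_SET_MSR_FILTER allows that access; the KVM-side wish is what lives in
shadow_msr_intercept so it can be replayed here when the filter changes. A
plain-C decision sketch, not kernel code:

::

  #include <stdbool.h>
  #include <stdio.h>

  /* true means "intercept", i.e. the bitmap bit is set and the access exits. */
  static bool intercept(bool kvm_wants_passthrough, bool filter_allows)
  {
          return !(kvm_wants_passthrough && filter_allows);
  }

  int main(void)
  {
          printf("passthrough + allowed -> intercept=%d\n", intercept(true,  true));
          printf("passthrough + denied  -> intercept=%d\n", intercept(true,  false));
          printf("intercepted + allowed -> intercept=%d\n", intercept(false, true));
          return 0;
  }
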
+
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
{
return exec_control;
}
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest. This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+ u32 control, bool enabled, bool exiting)
+{
+ /*
+ * If the control is for an opt-in feature, clear the control if the
+ * feature is not exposed to the guest, i.e. not enabled. If the
+ * control is opt-out, i.e. an exiting control, clear the control if
+ * the feature _is_ exposed to the guest, i.e. exiting/interception is
+ * disabled for the associated instruction. Note, the caller is
+ * responsible for presetting exec_control to set all supported bits.
+ */
+ if (enabled == exiting)
+ *exec_control &= ~control;
+
+ /*
+ * Update the nested MSR settings so that a nested VMM can/can't set
+ * controls for features that are/aren't exposed to the guest.
+ */
+ if (nested) {
+ if (enabled)
+ vmx->nested.msrs.secondary_ctls_high |= control;
+ else
+ vmx->nested.msrs.secondary_ctls_high &= ~control;
+ }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit. This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ __enabled = guest_cpuid_has(&(vmx)->vcpu, \
+ X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, \
+ SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
+ } \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
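
The core of vmx_adjust_secondary_exec_control() is the "enabled == exiting"
test: an opt-in control is cleared when the feature is hidden, an opt-out
(exiting) control is cleared when the feature is exposed. A small truth-table
sketch using a stand-in constant rather than a real control bit:

::

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static uint32_t adjust(uint32_t exec_control, uint32_t control,
                         bool enabled, bool exiting)
  {
          if (enabled == exiting)
                  exec_control &= ~control;
          return exec_control;
  }

  int main(void)
  {
          const uint32_t CTRL = 1u << 3;  /* stand-in for e.g. ENABLE_RDTSCP */

          /* opt-in control (exiting=false): kept only if the feature is enabled */
          printf("opt-in,  enabled : %#x\n", adjust(CTRL, CTRL, true,  false));
          printf("opt-in,  disabled: %#x\n", adjust(CTRL, CTRL, false, false));
          /* opt-out control (exiting=true): kept only if the feature is hidden  */
          printf("opt-out, enabled : %#x\n", adjust(CTRL, CTRL, true,  true));
          printf("opt-out, disabled: %#x\n", adjust(CTRL, CTRL, false, true));
          return 0;
  }
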
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
if (!enable_pml)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (vmx_xsaves_supported()) {
+ if (cpu_has_vmx_xsaves()) {
/* Exposing XSAVES only when XSAVE is exposed */
bool xsaves_enabled =
boot_cpu_has(X86_FEATURE_XSAVE) &&
vcpu->arch.xsaves_enabled = xsaves_enabled;
- if (!xsaves_enabled)
- exec_control &= ~SECONDARY_EXEC_XSAVES;
-
- if (nested) {
- if (xsaves_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_XSAVES;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_XSAVES;
- }
- }
-
- if (cpu_has_vmx_rdtscp()) {
- bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
- if (!rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
-
- if (nested) {
- if (rdtscp_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDTSCP;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
- }
- }
-
- if (cpu_has_vmx_invpcid()) {
- /* Exposing INVPCID only when PCID is exposed */
- bool invpcid_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PCID);
-
- if (!invpcid_enabled) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
- }
-
- if (nested) {
- if (invpcid_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_INVPCID;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_INVPCID;
- }
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_XSAVES,
+ xsaves_enabled, false);
}
- if (vmx_rdrand_supported()) {
- bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
- if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
- if (nested) {
- if (rdrand_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND_EXITING;
- }
- }
+ /*
+ * Expose INVPCID if and only if PCID is also exposed to the guest.
+ * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
+ * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
+ * behavior from the guest perspective (it would expect #GP or #PF).
+ */
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
- if (vmx_rdseed_supported()) {
- bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
- if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
- if (nested) {
- if (rdseed_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED_EXITING;
- }
- }
-
- if (vmx_waitpkg_supported()) {
- bool waitpkg_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
- if (!waitpkg_enabled)
- exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-
- if (nested) {
- if (waitpkg_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
- }
- }
+ vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+ ENABLE_USR_WAIT_PAUSE, false);
vmx->secondary_exec_control = exec_control;
}
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- if (vmx_xsaves_supported())
+ if (cpu_has_vmx_xsaves())
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
if (enable_pml) {
static int handle_invd(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction(vcpu, 0);
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
* would also use advanced VM-exit information for EPT violations to
* reconstruct the page fault error code.
*/
- if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+ if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
}
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
-{
- struct kvm_vcpu *vcpu;
- int cpu = smp_processor_id();
-
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
- }
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-
static void vmx_enable_tdp(void)
{
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
{
u32 vmx_instruction_info;
unsigned long type;
- bool pcid_enabled;
gva_t gva;
- struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
} operand;
- int r;
if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
sizeof(operand), &gva))
return 1;
- r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
- if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
-
- if (operand.pcid >> 12 != 0) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- switch (type) {
- case INVPCID_TYPE_INDIV_ADDR:
- if ((!pcid_enabled && (operand.pcid != 0)) ||
- is_noncanonical_address(operand.gla, vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_SINGLE_CTXT:
- if (!pcid_enabled && (operand.pcid != 0)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_ALL_NON_GLOBAL:
- /*
- * Currently, KVM doesn't mark global entries in the shadow
- * page tables, so a non-global flush just degenerates to a
- * global flush. If needed, we could optimize this later by
- * keeping track of global entries in shadow page tables.
- */
-
- fallthrough;
- case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
-
- default:
- BUG(); /* We have already checked above that type <= 3 */
- }
+ return kvm_handle_invpcid(vcpu, type, gva);
}
static int handle_pml_full(struct kvm_vcpu *vcpu)
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
*info1 = vmx_get_exit_qual(vcpu);
- *info2 = vmx_get_intr_info(vcpu);
+ if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ *info2 = vmx->idt_vectoring_info;
+ *intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ else
+ *error_code = 0;
+ } else {
+ *info2 = 0;
+ *intr_info = 0;
+ *error_code = 0;
+ }
}
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
return max_irr;
}
-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- return pi_test_on(pi_desc) ||
- (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
-}
-
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
+void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+{
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
+
+ kvm_before_interrupt(vcpu);
+ vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
- if (is_page_fault(intr_info)) {
+ if (is_page_fault(intr_info))
vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* Handle machine checks before interrupts are enabled */
- } else if (is_machine_check(intr_info)) {
+ else if (is_machine_check(intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- } else if (is_nmi(intr_info)) {
- kvm_before_interrupt(&vmx->vcpu);
- asm("int $2");
- kvm_after_interrupt(&vmx->vcpu);
- }
+ else if (is_nmi(intr_info))
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
- unsigned int vector;
- unsigned long entry;
-#ifdef CONFIG_X86_64
- unsigned long tmp;
-#endif
- gate_desc *desc;
u32 intr_info = vmx_get_intr_info(vcpu);
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- vector = intr_info & INTR_INFO_VECTOR_MASK;
- desc = (gate_desc *)host_idt_base + vector;
- entry = gate_offset(desc);
-
- kvm_before_interrupt(vcpu);
-
- asm volatile(
-#ifdef CONFIG_X86_64
- "mov %%rsp, %[sp]\n\t"
- "and $-16, %%rsp\n\t"
- "push %[ss]\n\t"
- "push %[sp]\n\t"
-#endif
- "pushf\n\t"
- "push %[cs]\n\t"
- CALL_NOSPEC
- :
-#ifdef CONFIG_X86_64
- [sp]"=&r"(tmp),
-#endif
- ASM_CALL_CONSTRAINT
- :
- [thunk_target]"r"(entry),
-#ifdef CONFIG_X86_64
- [ss]"i"(__KERNEL_DS),
-#endif
- [cs]"i"(__KERNEL_CS)
- );
-
- kvm_after_interrupt(vcpu);
+ handle_interrupt_nmi_irqoff(vcpu, intr_info);
}
-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
- if (lapic_in_kernel(vcpu) &&
- vcpu->arch.apic->lapic_timer.timer_advance_ns)
- kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
goto free_vpid;
}
- BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
- u32 index = vmx_msr_index[i];
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
+ u32 index = vmx_uret_msrs_list[i];
u32 data_low, data_high;
- int j = vmx->nmsrs;
+ int j = vmx->nr_uret_msrs;
if (rdmsr_safe(index, &data_low, &data_high) < 0)
continue;
if (wrmsr_safe(index, data_low, data_high) < 0)
continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
+ vmx->guest_uret_msrs[j].slot = i;
+ vmx->guest_uret_msrs[j].data = 0;
switch (index) {
case MSR_IA32_TSX_CTRL:
/*
* let's avoid changing CPUID bits under the host
* kernel's feet.
*/
- vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
break;
default:
- vmx->guest_msrs[j].mask = -1ull;
+ vmx->guest_uret_msrs[j].mask = -1ull;
break;
}
- ++vmx->nmsrs;
+ ++vmx->nr_uret_msrs;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
goto free_pml;
+ /* The MSR bitmap starts with all ones */
+ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+
msr_bitmap = vmx->vmcs01.msr_bitmap;
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
vmx->msr_bitmap_mode = 0;
}
if (nested)
- nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
- vmx_capability.ept);
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
- struct shared_msr_entry *msr;
- msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
+ struct vmx_uret_msr *msr;
+ msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
- vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
}
/* CPUID 0xD.1 */
supported_xss = 0;
- if (!vmx_xsaves_supported())
+ if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
/* CPUID 0x80000001 */
if (!cpu_has_vmx_rdtscp())
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
- if (vmx_waitpkg_supported())
+ if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
* Because it is marked as EmulateOnUD, we need to intercept it here.
*/
case x86_intercept_rdtscp:
- if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
-}
-
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
-{
- unsigned int dest;
- struct pi_desc old, new;
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
-
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
-
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'wakeup vector' */
- new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
-
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
-}
-
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
if (pi_pre_block(vcpu))
return 0;
}
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
- if (vcpu->pre_pcpu == -1)
- return;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
-}
-
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
if (kvm_x86_ops.set_hv_timer)
pi_post_block(vcpu);
}
-/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
-
- continue;
- }
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
/* RSM will cause a vmexit anyway. */
}
-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon;
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
- .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
.set_tss_addr = vmx_set_tss_addr,
.set_identity_map_addr = vmx_set_identity_map_addr,
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
- .update_pi_irte = vmx_update_pi_irte,
+ .update_pi_irte = pi_update_irte,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
.pre_leave_smm = vmx_pre_leave_smm,
.enable_smi_window = enable_smi_window,
- .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+ .can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
.migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
};
static __init int hardware_setup(void)
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
kvm_mce_cap_supported |= MCG_LMCE_P;
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+
+ pi_init(cpu);
}
#ifdef CONFIG_KEXEC_CORE
#include "capabilities.h"
#include "kvm_cache_regs.h"
-#include "ops.h"
+#include "posted_intr.h"
#include "vmcs.h"
+#include "vmx_ops.h"
#include "cpuid.h"
-extern const u32 vmx_msr_index[];
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
#ifdef CONFIG_X86_64
-#define NR_SHARED_MSRS 7
+#define MAX_NR_USER_RETURN_MSRS 7
#else
-#define NR_SHARED_MSRS 4
+#define MAX_NR_USER_RETURN_MSRS 4
#endif
-#define NR_LOADSTORE_MSRS 8
+#define MAX_NR_LOADSTORE_MSRS 8
struct vmx_msrs {
unsigned int nr;
- struct vmx_msr_entry val[NR_LOADSTORE_MSRS];
+ struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS];
};
-struct shared_msr_entry {
- unsigned index;
+struct vmx_uret_msr {
+ unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
u64 data;
u64 mask;
};
SEG_FIELD_NR = 4
};
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
#define RTIT_ADDR_RANGE 4
struct pt_ctx {
u32 idt_vectoring_info;
ulong rflags;
- struct shared_msr_entry guest_msrs[NR_SHARED_MSRS];
- int nmsrs;
- int save_nmsrs;
- bool guest_msrs_ready;
+ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
+ int nr_uret_msrs;
+ int nr_active_uret_msrs;
+ bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
u64 ept_pointer;
struct pt_desc pt_desc;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+ struct {
+ DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ } shadow_msr_intercept;
};
enum ept_pointers_status {
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
int root_level);
+
void update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
- struct x86_exception *e);
-
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
static inline u8 vmx_get_rvi(void)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
-static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
-{
- return &(to_vmx(vcpu)->pi_desc);
-}
-
static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
}
+static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
+{
+ return enable_unrestricted_guest && (!is_guest_mode(vcpu) ||
+ (secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST));
+}
+
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu);
+static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
+}
+
void dump_vmcs(void);
#endif /* __KVM_X86_VMX_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_INSN_H
+#define __KVM_X86_VMX_INSN_H
+
+#include <linux/nospec.h>
+
+#include <asm/kvm_host.h>
+#include <asm/vmx.h>
+
+#include "evmcs.h"
+#include "vmcs.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
+asmlinkage void vmread_error(unsigned long field, bool fault);
+__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
+ bool fault);
+void vmwrite_error(unsigned long field, unsigned long value);
+void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
+void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
+void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
+void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
+{
+ unsigned long value;
+
+ asm volatile("1: vmread %2, %1\n\t"
+ ".byte 0x3e\n\t" /* branch taken hint */
+ "ja 3f\n\t"
+
+ /*
+ * VMREAD failed. Push '0' for @fault, push the failing
+ * @field, and bounce through the trampoline to preserve
+ * volatile registers.
+ */
+ "push $0\n\t"
+ "push %2\n\t"
+ "2:call vmread_error_trampoline\n\t"
+
+ /*
+ * Unwind the stack. Note, the trampoline zeros out the
+ * memory for @fault so that the result is '0' on error.
+ */
+ "pop %2\n\t"
+ "pop %1\n\t"
+ "3:\n\t"
+
+ /* VMREAD faulted. As above, except push '1' for @fault. */
+ ".pushsection .fixup, \"ax\"\n\t"
+ "4: push $1\n\t"
+ "push %2\n\t"
+ "jmp 2b\n\t"
+ ".popsection\n\t"
+ _ASM_EXTABLE(1b, 4b)
+ : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
+ return value;
+}
+
+static __always_inline u16 vmcs_read16(unsigned long field)
+{
+ vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read16(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u32 vmcs_read32(unsigned long field)
+{
+ vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read32(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u64 vmcs_read64(unsigned long field)
+{
+ vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
+#ifdef CONFIG_X86_64
+ return __vmcs_readl(field);
+#else
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
+#endif
+}
+
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
+ return __vmcs_readl(field);
+}
+
+#define vmx_asm1(insn, op1, error_args...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1 : "cc" : error, fault); \
+ return; \
+error: \
+ instrumentation_begin(); \
+ insn##_error(error_args); \
+ instrumentation_end(); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define vmx_asm2(insn, op1, op2, error_args...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1, op2 : "cc" : error, fault); \
+ return; \
+error: \
+ instrumentation_begin(); \
+ insn##_error(error_args); \
+ instrumentation_end(); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
+}
+
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
+{
+ vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write16(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
+{
+ vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
+{
+ vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+ __vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
+}
+
+static inline void vmcs_clear(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+
+ vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_load(phys_addr);
+
+ vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+
+ vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
+}
+
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+ struct {
+ u64 eptp, gpa;
+ } operand = {eptp, gpa};
+
+ vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
+}
+
+static inline void vpid_sync_vcpu_single(int vpid)
+{
+ if (vpid == 0)
+ return;
+
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(int vpid)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vpid);
+ else if (vpid != 0)
+ vpid_sync_vcpu_global();
+}
+
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_individual_addr())
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ else
+ vpid_sync_context(vpid);
+}
+
+static inline void ept_sync_global(void)
+{
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
+}
+
+#endif /* __KVM_X86_VMX_INSN_H */
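/*
 * Editor's note: a minimal illustrative sketch, not part of the patch,
 * showing how the width-checked VMCS accessors above are meant to be used.
 * The function name is hypothetical; CPU_BASED_VM_EXEC_CONTROL and
 * CPU_BASED_CR3_LOAD_EXITING are the usual 32-bit field/flag definitions
 * from <asm/vmx.h>.
 */
static inline void example_toggle_cr3_load_exiting(bool enable)
{
	/* 32-bit field, so vmcs_check32() compiles away without complaint. */
	u32 exec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);

	if (enable)
		exec |= CPU_BASED_CR3_LOAD_EXITING;
	else
		exec &= ~CPU_BASED_CR3_LOAD_EXITING;

	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec);
}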
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <clocksource/hyperv_timer.h>
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
-#define KVM_NR_SHARED_MSRS 16
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_shared_msrs_global {
+struct kvm_user_return_msrs_global {
int nr;
- u32 msrs[KVM_NR_SHARED_MSRS];
+ u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
- struct kvm_shared_msr_values {
+ struct kvm_user_return_msr_values {
u64 host;
u64 curr;
- } values[KVM_NR_SHARED_MSRS];
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
} else {
vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
op, msr, data);
- return 1;
+ return -ENOENT;
}
}
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
- struct kvm_shared_msrs *locals
- = container_of(urn, struct kvm_shared_msrs, urn);
- struct kvm_shared_msr_values *values;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
unsigned long flags;
/*
* interrupted and executed through kvm_arch_hardware_disable()
*/
local_irq_save(flags);
- if (locals->registered) {
- locals->registered = false;
+ if (msrs->registered) {
+ msrs->registered = false;
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- values = &locals->values[slot];
+ for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ wrmsrl(user_return_msrs_global.msrs[slot], values->host);
values->curr = values->host;
}
}
}
-void kvm_define_shared_msr(unsigned slot, u32 msr)
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
{
- BUG_ON(slot >= KVM_NR_SHARED_MSRS);
- shared_msrs_global.msrs[slot] = msr;
- if (slot >= shared_msrs_global.nr)
- shared_msrs_global.nr = slot + 1;
+ BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+ user_return_msrs_global.msrs[slot] = msr;
+ if (slot >= user_return_msrs_global.nr)
+ user_return_msrs_global.nr = slot + 1;
}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
-static void kvm_shared_msr_cpu_online(void)
+static void kvm_user_return_msr_cpu_online(void)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
u64 value;
int i;
- for (i = 0; i < shared_msrs_global.nr; ++i) {
- rdmsrl_safe(shared_msrs_global.msrs[i], &value);
- smsr->values[i].host = value;
- smsr->values[i].curr = value;
+ for (i = 0; i < user_return_msrs_global.nr; ++i) {
+ rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
}
}
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
int err;
- value = (value & mask) | (smsr->values[slot].host & ~mask);
- if (value == smsr->values[slot].curr)
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
if (err)
return 1;
- smsr->values[slot].curr = value;
- if (!smsr->registered) {
- smsr->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&smsr->urn);
- smsr->registered = true;
+ msrs->values[slot].curr = value;
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
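/*
 * Editor's note: hedged sketch (not part of the patch) of the intended usage
 * pattern for the renamed user-return-MSR helpers.  Slot 0 and MSR_TSC_AUX
 * are illustrative assumptions, not what any vendor module necessarily uses.
 */
static void example_define_user_return_msrs(void)
{
	/* Once, at hardware setup time: reserve a slot for the MSR. */
	kvm_define_user_return_msr(0, MSR_TSC_AUX);
}

static void example_enter_guest(u64 guest_tsc_aux)
{
	/*
	 * Before running the guest: load the guest value.  The host value is
	 * restored lazily by kvm_on_user_return() when the CPU returns to
	 * userspace, not on every VM-exit.
	 */
	kvm_set_user_return_msr(0, guest_tsc_aux, -1ull);
}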
static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
- if (smsr->registered)
- kvm_on_user_return(&smsr->urn);
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
}
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+ u32 count = kvm->arch.msr_filter.count;
+ u32 i;
+ bool r = kvm->arch.msr_filter.default_allow;
+ int idx;
+
+ /* MSR filtering not set up, allow everything */
+ if (!count)
+ return true;
+
+ /* Prevent collision with set_msr_filter */
+ idx = srcu_read_lock(&kvm->srcu);
+
+ for (i = 0; i < count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ r = !!test_bit(index - start, bitmap);
+ break;
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
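/*
 * Editor's note (worked example, illustrative only): with a range of
 * base = 0xc0000080 and nmsrs = 2, bit 0 of the bitmap governs accesses to
 * MSR 0xc0000080 (EFER) and bit 1 governs 0xc0000081 (STAR); any index
 * outside [base, base + nmsrs) falls through to default_allow.
 */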
+
/*
* Write @data into the MSR specified by @index. Select MSR specific fault
* checks are bypassed if @host_initiated is %true.
{
struct msr_data msr;
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return -EPERM;
+
switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
struct msr_data msr;
int ret;
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return -EPERM;
+
msr.index = index;
msr.host_initiated = host_initiated;
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+{
+ if (vcpu->run->msr.error) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (is_read) {
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, true);
+}
+
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, false);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+ switch (r) {
+ case -ENOENT:
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
+ case -EPERM:
+ return KVM_MSR_EXIT_REASON_FILTER;
+ default:
+ return KVM_MSR_EXIT_REASON_INVAL;
+ }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+ u32 exit_reason, u64 data,
+ int (*completion)(struct kvm_vcpu *vcpu),
+ int r)
+{
+ u64 msr_reason = kvm_msr_reason(r);
+
+ /* Check if the user wanted to know about this MSR fault */
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+ return 0;
+
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->run->msr.error = 0;
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+ vcpu->run->msr.reason = msr_reason;
+ vcpu->run->msr.index = index;
+ vcpu->run->msr.data = data;
+ vcpu->arch.complete_userspace_io = completion;
+
+ return 1;
+}
+
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r);
+}
+
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_wrmsr, r);
+}
+
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
+ int r;
- if (kvm_get_msr(vcpu, ecx, &data)) {
+ r = kvm_get_msr(vcpu, ecx, &data);
+
+ /* MSR read failed? See if we should ask user space */
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
+ /* Bounce to user space */
+ return 0;
+ }
+
+ /* MSR read failed? Inject a #GP */
+ if (r) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data = kvm_read_edx_eax(vcpu);
+ int r;
+
+ r = kvm_set_msr(vcpu, ecx, data);
+
+ /* MSR write failed? See if we should ask user space */
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) {
+ /* Bounce to user space */
+ return 0;
+ }
- if (kvm_set_msr(vcpu, ecx, data)) {
+ /* MSR write failed? Inject a #GP */
+ if (r) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
}
#endif
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
-{
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- kvm_vcpu_kick(vcpu);
-}
-
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
int version;
#endif
}
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
-{
- u64 curr_offset = vcpu->arch.l1_tsc_offset;
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
-}
-
/*
* Multiply tsc by a fixed point number represented by ratio.
*
return check_tsc_unstable();
}
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
bool matched;
bool already_matched;
- u64 data = msr->data;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
- if (data == 0 && msr->host_initiated) {
+ if (data == 0) {
/*
* detection of vcpu initialization -- need to sync
* with other vCPUs. This particularly helps to keep
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
- update_ia32_tsc_adjust_msr(vcpu, offset);
-
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
return 1;
if (!lapic_in_kernel(vcpu))
- return 1;
+ return data ? 1 : 0;
vcpu->arch.apf.msr_en_val = data;
vcpu->arch.msr_ia32_power_ctl = data;
break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, msr_info);
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, data);
+ } else {
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
break;
case MSR_IA32_XSS:
if (!msr_info->host_initiated &&
case MSR_IA32_POWER_CTL:
msr_info->data = vcpu->arch.msr_ia32_power_ctl;
break;
- case MSR_IA32_TSC:
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. AMD manual doesn't explicitly
+ * state this but appears to behave the same.
+ *
+ * Unconditionally return L1's TSC offset on userspace reads
+ * so that userspace reads and writes always operate on L1's
+ * offset, e.g. to ensure deterministic behavior for migration.
+ */
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+ vcpu->arch.tsc_offset;
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
break;
+ }
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case KVM_CAP_EXCEPTION_PAYLOAD:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_LAST_CPU:
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ case KVM_CAP_X86_MSR_FILTER:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ kvm->arch.user_space_msr_mask = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
return r;
}
+static void kvm_clear_msr_filter(struct kvm *kvm)
+{
+ u32 i;
+ u32 count = kvm->arch.msr_filter.count;
+ struct msr_bitmap_range ranges[16];
+
+ mutex_lock(&kvm->lock);
+ kvm->arch.msr_filter.count = 0;
+ memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu(&kvm->srcu);
+
+ for (i = 0; i < count; i++)
+ kfree(ranges[i].bitmap);
+}
+
+static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+{
+ struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+ struct msr_bitmap_range range;
+ unsigned long *bitmap = NULL;
+ size_t bitmap_size;
+ int r;
+
+ if (!user_range->nmsrs)
+ return 0;
+
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ return -EINVAL;
+
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ range = (struct msr_bitmap_range) {
+ .flags = user_range->flags,
+ .base = user_range->base,
+ .nmsrs = user_range->nmsrs,
+ .bitmap = bitmap,
+ };
+
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ if (!range.flags) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ /* Everything ok, add this range identifier to our global pool */
+ ranges[kvm->arch.msr_filter.count] = range;
+ /* Make sure we filled the array before we tell anyone to walk it */
+ smp_wmb();
+ kvm->arch.msr_filter.count++;
+
+ return 0;
+err:
+ kfree(bitmap);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter filter;
+ bool default_allow;
+ int r = 0;
+ u32 i;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ kvm_clear_msr_filter(kvm);
+
+ default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ kvm->arch.msr_filter.default_allow = default_allow;
+
+ /*
+ * Protect from concurrent calls to this function that could trigger
+ * a TOCTOU violation on kvm->arch.msr_filter.count.
+ */
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
+ if (r)
+ break;
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
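/*
 * Editor's note: hedged userspace sketch, not part of the patch, showing how
 * a VMM might invoke the new ioctl on a VM fd.  The fd handling, the chosen
 * MSR range (0xc0000080, i.e. EFER/STAR) and the bitmap contents are
 * assumptions for illustration only.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_set_msr_filter(int vm_fd)
{
	/* Deny by default, then allow the first two MSRs of the range. */
	static __u8 bitmap[8] = { 0x3 };
	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_DENY,
		.ranges[0] = {
			.flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
			.base = 0xc0000080,
			.nmsrs = 2,
			.bitmap = bitmap,
		},
	};

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}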
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
case KVM_SET_PMU_EVENT_FILTER:
r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
break;
+ case KVM_X86_SET_MSR_FILTER:
+ r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
+ break;
default:
r = -ENOTTY;
}
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ return 1;
+
if (force_emulation_prefix &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_get_msr(vcpu, msr_index, pdata);
+
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_set_msr(vcpu, msr_index, data);
+
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
int r;
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ bool write_fault_to_spt;
+
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ return 1;
vcpu->arch.l1tf_flush_l1d = true;
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
*/
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
goto out_free_x86_fpu_cache;
}
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
- if (!shared_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+ if (!user_return_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache;
}
return 0;
out_free_percpu:
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
out_free_x86_fpu_cache:
#endif
kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit();
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
kmem_cache_destroy(x86_fpu_cache);
}
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
- if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
goto out;
}
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+ kvm_x86_ops.msr_filter_changed(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
kvm_x86_ops.request_immediate_exit(vcpu);
}
- trace_kvm_entry(vcpu->vcpu_id);
+ trace_kvm_entry(vcpu);
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
kvm_hv_vcpu_postcreate(vcpu);
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- msr.data = 0x0;
- msr.index = MSR_IA32_TSC;
- msr.host_initiated = true;
- kvm_write_tsc(vcpu, &msr);
+ kvm_synchronize_tsc(vcpu, 0);
vcpu_put(vcpu);
/* poll control enabled by default */
u64 max_tsc = 0;
bool stable, backwards_tsc = false;
- kvm_shared_msr_cpu_online();
+ kvm_user_return_msr_cpu_online();
ret = kvm_x86_ops.hardware_enable();
if (ret != 0)
return ret;
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ u32 i;
+
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
}
if (kvm_x86_ops.vm_destroy)
kvm_x86_ops.vm_destroy(kvm);
+ for (i = 0; i < kvm->arch.msr_filter.count; i++)
+ kfree(kvm->arch.msr_filter.ranges[i].bitmap);
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
}
EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
+{
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+	 * while handling a VMX instruction, KVM could have handled the request
+	 * correctly by exiting to userspace and performing I/O, but there
+	 * doesn't seem to be a real use-case for such requests; just return
+	 * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+ bool pcid_enabled;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int r;
+
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ fallthrough;
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_mmu_unload(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
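/*
 * Editor's note: illustrative sketch, not taken from the patch, of how a
 * vendor exit handler is expected to funnel into the common helper after
 * validating the INVPCID type (hence the BUG() above for type > 3).  The
 * function name is hypothetical.
 */
static int example_handle_invpcid_exit(struct kvm_vcpu *vcpu,
					unsigned long type, gva_t gva)
{
	if (type > 3) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_handle_invpcid(vcpu, type, gva);
}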
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
}
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_spec_ctrl_test_value(u64 value);
int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e);
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
#define KVM_MSR_RET_INVALID 2
ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \
- ERSN(HYPERV), ERSN(ARM_NISV)
+ ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR)
TRACE_EVENT(kvm_userspace_exit,
TP_PROTO(__u32 reason, int errno),
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
#define KVM_EXIT_ARM_NISV 28
+#define KVM_EXIT_X86_RDMSR 29
+#define KVM_EXIT_X86_WRMSR 30
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
__u64 esr_iss;
__u64 fault_ipa;
} arm_nisv;
+ /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+ struct {
+ __u8 error; /* user -> kernel */
+ __u8 pad[7];
+#define KVM_MSR_EXIT_REASON_INVAL (1 << 0)
+#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1)
+#define KVM_MSR_EXIT_REASON_FILTER (1 << 2)
+ __u32 reason; /* kernel -> user */
+ __u32 index; /* kernel -> user */
+ __u64 data; /* kernel <-> user */
+ } msr;
/* Fix the size of the union. */
char padding[256];
};
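/*
 * Editor's note: hedged sketch of the userspace side of a deflected RDMSR
 * exit (compare the user_msr_test selftest further below).  The policy of
 * returning zero for unknown MSRs is an assumption, not mandated by the ABI.
 */
static void example_handle_rdmsr_exit(struct kvm_run *run)
{
	/* kernel -> user: reason/index describe the access that was deflected. */
	if (run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN) {
		run->msr.data = 0;	/* user -> kernel: value handed to the guest */
		run->msr.error = 0;	/* 0: complete the access */
	} else {
		run->msr.error = 1;	/* non-zero: KVM injects #GP instead */
	}
}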
#define KVM_VM_PPC_HV 1
#define KVM_VM_PPC_PR 2
-/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
-#define KVM_VM_MIPS_TE 0
+/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */
+#define KVM_VM_MIPS_AUTO 0
#define KVM_VM_MIPS_VZ 1
+#define KVM_VM_MIPS_TE 2
#define KVM_S390_SIE_PAGE_OFFSET 1
#define KVM_CAP_SMALLER_MAXPHYADDR 185
#define KVM_CAP_S390_DIAG318 186
#define KVM_CAP_STEAL_TIME 187
+#define KVM_CAP_X86_USER_SPACE_MSR 188
+#define KVM_CAP_X86_MSR_FILTER 189
#ifdef KVM_CAP_IRQ_ROUTING
/* Available with KVM_CAP_S390_PROTECTED */
#define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
+/* Available with KVM_CAP_X86_MSR_FILTER */
+#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter)
+
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
/* Guest initialization commands */
/x86_64/set_sregs_test
/x86_64/smm_test
/x86_64/state_test
+/x86_64/user_msr_test
/x86_64/vmx_preemption_timer_test
/x86_64/svm_vmcall_test
/x86_64/sync_regs_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test
TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_DESC 0x00000004
-#define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define UNITY (1ull << 30)
+#define HOST_ADJUST (UNITY * 64)
+#define GUEST_STEP (UNITY * 4)
+#define ROUND(x) ((x + UNITY / 2) & -UNITY)
+#define rounded_rdmsr(x) ROUND(rdmsr(x))
+#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x))
+
+#define GUEST_ASSERT_EQ(a, b) do { \
+ __typeof(a) _a = (a); \
+ __typeof(b) _b = (b); \
+ if (_a != _b) \
+ ucall(UCALL_ABORT, 4, \
+ "Failed guest assert: " \
+ #a " == " #b, __LINE__, _a, _b); \
+ } while(0)
+
+static void guest_code(void)
+{
+ u64 val = 0;
+
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ val = 1ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ GUEST_SYNC(2);
+ val = 2ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Host: setting the TSC offset. */
+ GUEST_SYNC(3);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ GUEST_SYNC(4);
+ val = 3ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ GUEST_SYNC(5);
+ val = 4ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+ struct ucall uc;
+
+ vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+ vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+ switch (get_ucall(vm, vcpuid, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ return;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ TEST_ASSERT(false, "%s at %s:%ld\n" \
+ "\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+ }
+}
+
+int main(void)
+{
+ struct kvm_vm *vm;
+ uint64_t val;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ val = 0;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 1);
+ val = 1ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 2);
+ val = 2ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Host: writes to MSR_IA32_TSC set the host-side offset
+ * and therefore do not change MSR_IA32_TSC_ADJUST.
+ */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+ run_vcpu(vm, VCPU_ID, 3);
+
+ /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456);
+
+ /* Restore previous value. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ run_vcpu(vm, VCPU_ID, 4);
+ val = 3ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ run_vcpu(vm, VCPU_ID, 5);
+ val = 4ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER
+ *
+ * Copyright (C) 2020, Amazon Inc.
+ *
+ * This is a functional test to verify that we can deflect MSR events
+ * into user space.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+static u32 msr_reads, msr_writes;
+
+static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_deadbeef[1] = { 0x1 };
+
+static void deny_msr(uint8_t *bitmap, u32 msr)
+{
+ u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1);
+
+ bitmap[idx / 8] &= ~(1 << (idx % 8));
+}
+
+static void prepare_bitmaps(void)
+{
+ memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000));
+ memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write));
+ memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000));
+ memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000));
+ memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read));
+
+ deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL);
+ deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK);
+ deny_msr(bitmap_c0000000_read, MSR_GS_BASE);
+}
+
+struct kvm_msr_filter filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_DENY,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000_write,
+ }, {
+ .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
+ .base = 0x40000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_40000000,
+ }, {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000_read,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ,
+ .base = 0xdeadbeef,
+ .nmsrs = 1,
+ .bitmap = bitmap_deadbeef,
+ },
+ },
+};
+
+struct kvm_msr_filter no_filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+};
+
+static void guest_msr_calls(bool trapped)
+{
+ /* This goes into the in-kernel emulation */
+ wrmsr(MSR_SYSCALL_MASK, 0);
+
+ if (trapped) {
+ /* This goes into user space emulation */
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE);
+ } else {
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE);
+ }
+
+ /* If trapped == true, this goes into user space emulation */
+ wrmsr(MSR_IA32_POWER_CTL, 0x1234);
+
+ /* This goes into the in-kernel emulation */
+ rdmsr(MSR_IA32_POWER_CTL);
+
+ /* Invalid MSR, should always be handled by user space exit */
+ GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef);
+ wrmsr(0xdeadbeef, 0x1234);
+}
+
+static void guest_code(void)
+{
+ guest_msr_calls(true);
+
+ /*
+ * Disable msr filtering, so that the kernel
+ * handles everything in the next round
+ */
+ GUEST_SYNC(0);
+
+ guest_msr_calls(false);
+
+ GUEST_DONE();
+}
+
+static int handle_ucall(struct kvm_vm *vm)
+{
+ struct ucall uc;
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("Guest assertion not met");
+ break;
+ case UCALL_SYNC:
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter);
+ break;
+ case UCALL_DONE:
+ return 1;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ return 0;
+}
+
+static void handle_rdmsr(struct kvm_run *run)
+{
+ run->msr.data = run->msr.index;
+ msr_reads++;
+
+ if (run->msr.index == MSR_SYSCALL_MASK ||
+ run->msr.index == MSR_GS_BASE) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR read trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "MSR deadbeef read trap w/o inval fault");
+ }
+}
+
+static void handle_wrmsr(struct kvm_run *run)
+{
+ /* ignore */
+ msr_writes++;
+
+ if (run->msr.index == MSR_IA32_POWER_CTL) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for MSR_IA32_POWER_CTL incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR_IA32_POWER_CTL trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for deadbeef incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "deadbeef trap w/o inval fault");
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_X86_USER_SPACE_MSR,
+ .args[0] = KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER,
+ };
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, &cap);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ prepare_bitmaps();
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+
+ while (1) {
+ rc = _vcpu_run(vm, VCPU_ID);
+
+ TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_X86_RDMSR:
+ handle_rdmsr(run);
+ break;
+ case KVM_EXIT_X86_WRMSR:
+ handle_wrmsr(run);
+ break;
+ case KVM_EXIT_IO:
+ if (handle_ucall(vm))
+ goto done;
+ break;
+ }
+
+ }
+
+done:
+ TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
+ TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
struct eventfd_ctx *eventfd;
struct kvm_io_bus *bus;
int ret = -ENOENT;
+ bool wildcard;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
+ wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+
mutex_lock(&kvm->slots_lock);
list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
- bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
if (p->bus_idx != bus_idx ||
p->eventfd != eventfd ||
void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev)
{
- int i;
+ int i, j;
struct kvm_io_bus *new_bus, *bus;
bus = kvm_get_bus(kvm, bus_idx);
new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
GFP_KERNEL_ACCOUNT);
- if (!new_bus) {
+ if (new_bus) {
+ memcpy(new_bus, bus, struct_size(bus, range, i));
+ new_bus->dev_count--;
+ memcpy(new_bus->range + i, bus->range + i + 1,
+ flex_array_size(new_bus, range, new_bus->dev_count - i));
+ } else {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- goto broken;
+ for (j = 0; j < bus->dev_count; j++) {
+ if (j == i)
+ continue;
+ kvm_iodevice_destructor(bus->range[j].dev);
+ }
}
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
- new_bus->dev_count--;
- memcpy(new_bus->range + i, bus->range + i + 1,
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
-
-broken:
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);