boot, the measurement can be verified by comparing it to what the guest owner
expects.
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
Parameters (in): struct kvm_sev_launch_measure
Returns: 0 on success, -negative on error
commands and signed with the PEK. The digest returned by the command should match the digest
used by the guest owner with the KVM_SEV_LAUNCH_MEASURE.
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
Parameters (in): struct kvm_sev_attestation
Returns: 0 on success, -negative on error
__u32 len;
};
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+ struct kvm_sev_send_start {
+ __u32 policy; /* guest policy */
+
+ __u64 pdh_cert_uaddr; /* platform Diffie-Hellman certificate */
+ __u32 pdh_cert_len;
+
+ __u64 plat_certs_uaddr; /* platform certificate chain */
+ __u32 plat_certs_len;
+
+ __u64 amd_certs_uaddr; /* AMD certificate */
+ __u32 amd_certs_len;
+
+ __u64 session_uaddr; /* Guest session information */
+ __u32 session_len;
+ };
+
+12. KVM_SEV_SEND_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_SEND_UPDATE_DATA command can be used by the hypervisor to encrypt the
+outgoing guest memory region with the encryption context creating using
+KVM_SEV_SEND_START.
+
+If hdr_len or trans_len are zero on entry, the length of the packet header and
+transport region are written to hdr_len and trans_len respectively, and all
+other fields are not used.
+
+Parameters (in): struct kvm_sev_send_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_launch_send_update_data {
+ __u64 hdr_uaddr; /* userspace address containing the packet header */
+ __u32 hdr_len;
+
+ __u64 guest_uaddr; /* the source memory region to be encrypted */
+ __u32 guest_len;
+
+ __u64 trans_uaddr; /* the destination memory region */
+ __u32 trans_len;
+ };
+
+13. KVM_SEV_SEND_FINISH
+------------------------
+
+After completion of the migration flow, the KVM_SEV_SEND_FINISH command can be
+issued by the hypervisor to delete the encryption context.
+
+Returns: 0 on success, -negative on error
+
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
+15. KVM_SEV_RECEIVE_START
+------------------------
+
+The KVM_SEV_RECEIVE_START command is used for creating the memory encryption
+context for an incoming SEV guest. To create the encryption context, the user must
+provide a guest policy, the platform public Diffie-Hellman (PDH) key and session
+information.
+
+Parameters: struct kvm_sev_receive_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_receive_start {
+ __u32 handle; /* if zero then firmware creates a new handle */
+ __u32 policy; /* guest's policy */
+
+ __u64 pdh_uaddr; /* userspace address pointing to the PDH key */
+ __u32 pdh_len;
+
+ __u64 session_uaddr; /* userspace address which points to the guest session information */
+ __u32 session_len;
+ };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.12.
+
+16. KVM_SEV_RECEIVE_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_RECEIVE_UPDATE_DATA command can be used by the hypervisor to copy
+the incoming buffers into the guest memory region with encryption context
+created during the KVM_SEV_RECEIVE_START.
+
+Parameters (in): struct kvm_sev_receive_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_launch_receive_update_data {
+ __u64 hdr_uaddr; /* userspace address containing the packet header */
+ __u32 hdr_len;
+
+ __u64 guest_uaddr; /* the destination guest memory region */
+ __u32 guest_len;
+
+ __u64 trans_uaddr; /* the incoming buffer memory region */
+ __u32 trans_len;
+ };
+
+17. KVM_SEV_RECEIVE_FINISH
+------------------------
+
+After completion of the migration flow, the KVM_SEV_RECEIVE_FINISH command can be
+issued by the hypervisor to make the guest ready for execution.
+
+Returns: 0 on success, -negative on error
+
References
==========
====== ============================================================
EFAULT the msr index list cannot be read from or written to
- E2BIG the msr index list is to be to fit in the array specified by
+ E2BIG the msr index list is too big to fit in the array specified by
the user.
====== ============================================================
For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
+Also when supported, KVM_CAP_SET_GUEST_DEBUG2 capability indicates the
+supported KVM_GUESTDBG_* bits in the control field.
+
When debug events exit the main run loop with the reason
KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
structure containing architecture specific debug information.
Queues an SMI on the thread's vcpu.
-4.97 KVM_CAP_PPC_MULTITCE
--------------------------
+4.97 KVM_X86_SET_MSR_FILTER
+----------------------------
-:Capability: KVM_CAP_PPC_MULTITCE
-:Architectures: ppc
-:Type: vm
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
-This capability means the kernel is capable of handling hypercalls
-H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
-space. This significantly accelerates DMA operations for PPC KVM guests.
-User space should expect that its handlers for these hypercalls
-are not going to be called if user space previously registered LIOBN
-in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+::
-In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
-user space might have to advertise it for the guest. For example,
-IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
-present in the "ibm,hypertas-functions" device-tree property.
+ struct kvm_msr_filter_range {
+ #define KVM_MSR_FILTER_READ (1 << 0)
+ #define KVM_MSR_FILTER_WRITE (1 << 1)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+ };
-The hypercalls mentioned above may or may not be processed successfully
-in the kernel based fast path. If they can not be handled by the kernel,
-they will get passed on to user space. So user space still has to have
-an implementation for these despite the in kernel acceleration.
+ #define KVM_MSR_FILTER_MAX_RANGES 16
+ struct kvm_msr_filter {
+ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+ #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+ };
-This capability is always enabled.
+flags values for ``struct kvm_msr_filter_range``:
+
+``KVM_MSR_FILTER_READ``
+
+ Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a read should immediately fail, while a 1 indicates that
+ a read for a particular MSR should be handled regardless of the default
+ filter action.
+
+``KVM_MSR_FILTER_WRITE``
+
+ Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a write should immediately fail, while a 1 indicates that
+ a write for a particular MSR should be handled regardless of the default
+ filter action.
+
+``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
+
+ Filter both read and write accesses to MSRs using the given bitmap. A 0
+ in the bitmap indicates that both reads and writes should immediately fail,
+ while a 1 indicates that reads and writes for a particular MSR are not
+ filtered by this range.
+
+flags values for ``struct kvm_msr_filter``:
+
+``KVM_MSR_FILTER_DEFAULT_ALLOW``
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to allowing access to the MSR.
+
+``KVM_MSR_FILTER_DEFAULT_DENY``
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to rejecting access to the MSR. In this mode, all MSRs that should
+ be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be explicitly filtered for or not.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+default KVM in-kernel emulation behavior is fully preserved.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
+an error.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
+x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
+and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
+register.
+
+If a bit is within one of the defined ranges, read and write accesses are
+guarded by the bitmap's value for the MSR index if the kind of access
+is included in the ``struct kvm_msr_filter_range`` flags. If no range
+cover this particular access, the behavior is determined by the flags
+field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
+and ``KVM_MSR_FILTER_DEFAULT_DENY``.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range goes from MSR index [base .. base+nmsrs]. The flags field
+indicates whether reads, writes or both reads and writes are filtered
+by setting a 1 bit in the bitmap for the corresponding MSR index.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
+allows user space to deflect and potentially handle various MSR accesses
+into user space.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
4.98 KVM_CREATE_SPAPR_TCE_64
----------------------------
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Sets the exception vector used to deliver Xen event channel upcalls.
-4.128 KVM_XEN_HVM_GET_ATTR
+4.127 KVM_XEN_HVM_GET_ATTR
--------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
Allows Xen VM attributes to be read. For the structure and types,
see KVM_XEN_HVM_SET_ATTR above.
-4.129 KVM_XEN_VCPU_SET_ATTR
+4.128 KVM_XEN_VCPU_SET_ATTR
---------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
or RUNSTATE_offline) to set the current accounted state as of the
adjusted state_entry_time.
-4.130 KVM_XEN_VCPU_GET_ATTR
+4.129 KVM_XEN_VCPU_GET_ATTR
---------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
This capability can be used to check / enable 2nd DAWR feature provided
by POWER10 processor.
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+Architectures: x86 SEV enabled
+Type: vm
+Parameters: args[0] is the fd of the source vm
+Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
+7.25 KVM_CAP_SGX_ATTRIBUTE
+----------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+ attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more priveleged enclave attributes. args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint. To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
8. Other capabilities.
======================
The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+
+8.31 KVM_CAP_PPC_MULTITCE
+-------------------------
+
+:Capability: KVM_CAP_PPC_MULTITCE
+:Architectures: ppc
+:Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
following two cases:
1. Access Tracking: The SPTE is not present, but it is marked for access
- tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
- restore the saved R/X bits. This is described in more detail later below.
+ tracking. That means we need to restore the saved R/X bits. This is
+ described in more detail later below.
-2. Write-Protection: The SPTE is present and the fault is
- caused by write-protect. That means we just need to change the W bit of
- the spte.
+2. Write-Protection: The SPTE is present and the fault is caused by
+ write-protect. That means we just need to change the W bit of the spte.
-What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
-SPTE_MMU_WRITEABLE bit on the spte:
+What we use to avoid all the race is the Host-writable bit and MMU-writable bit
+on the spte:
-- SPTE_HOST_WRITEABLE means the gfn is writable on host.
-- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
- the gfn is writable on guest mmu and it is not write-protected by shadow
- page write-protection.
+- Host-writable means the gfn is writable in the host kernel page tables and in
+ its KVM memslot.
+- MMU-writable means the gfn is writable in the guest's mmu and it is not
+ write-protected by shadow page write-protection.
On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
-restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
-is safe because whenever changing these bits can be detected by cmpxchg.
+bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, to restore the saved
+R/X bits if for an access-traced spte, or both. This is safe because whenever
+changing these bits can be detected by cmpxchg.
But we need carefully check these cases:
Lockless Access Tracking:
This is used for Intel CPUs that are using EPT but do not support the EPT A/D
-bits. In this case, when the KVM MMU notifier is called to track accesses to a
-page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
-by clearing the RWX bits in the PTE and storing the original R & X bits in
-some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
-PTE (using the ignored bit 62). When the VM tries to access the page later on,
-a fault is generated and the fast page fault mechanism described above is used
-to atomically restore the PTE to a Present state. The W bit is not saved when
-the PTE is marked for access tracking and during restoration to the Present
-state, the W bit is set depending on whether or not it was a write access. If
-it wasn't, then the W bit will remain clear until a write access happens, at
-which time it will be set using the Dirty tracking mechanism described above.
+bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and
+when the KVM MMU notifier is called to track accesses to a page (via
+kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware
+by clearing the RWX bits in the PTE and storing the original R & X bits in more
+unused/ignored bits. When the VM tries to access the page later on, a fault is
+generated and the fast page fault mechanism described above is used to
+atomically restore the PTE to a Present state. The W bit is not saved when the
+PTE is marked for access tracking and during restoration to the Present state,
+the W bit is set depending on whether or not it was a write access. If it
+wasn't, then the W bit will remain clear until a write access happens, at which
+time it will be set using the Dirty tracking mechanism described above.
3. Reference
------------
This function code is handled by userspace.
This diagnose function code has no subfunctions and uses no parameters.
+
+
+DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield
+---------------------------------------------------------
+
+General register 1 contains the target CPU address.
+
+In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs,
+DIAGNOSE with function code 0x9c may improve system performance by
+yielding the host CPU on which the guest CPU is running to be assigned
+to another guest CPU, preferably the logical CPU containing the specified
+target CPU.
+
+
+DIAG 'X'9C forwarding
++++++++++++++++++++++
+
+The guest may send a DIAGNOSE 0x9c in order to yield to a certain
+other vcpu. An example is a Linux guest that tries to yield to the vcpu
+that is currently holding a spinlock, but not running.
+
+However, on the host the real cpu backing the vcpu may itself not be
+running.
+Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to
+the backing cpu will hopefully cause that cpu, and thus subsequently
+the guest's vcpu, to be scheduled.
+
+
+diag9c_forwarding_hz
+ KVM kernel parameter allowing to specify the maximum number of DIAGNOSE
+ 0x9c forwarding per second in the purpose of avoiding a DIAGNOSE 0x9c
+ forwarding storm.
+ A value of 0 turns the forwarding off.
configured with a library OS and run-time which permits the application to run.
The enclave run-time and library OS work together to execute the application
when a thread enters the enclave.
+
+Impact of Potential Kernel SGX Bugs
+===================================
+
+EPC leaks
+---------
+
+When EPC page leaks happen, a WARNING like this is shown in dmesg:
+
+"EREMOVE returned ... and an EPC page was leaked. SGX may become unusable..."
+
+This is effectively a kernel use-after-free of an EPC page, and due
+to the way SGX works, the bug is detected at freeing. Rather than
+adding the page back to the pool of available EPC pages, the kernel
+intentionally leaks the page to avoid additional errors in the future.
+
+When this happens, the kernel will likely soon leak more EPC pages, and
+SGX will likely become unusable because the memory available to SGX is
+limited. However, while this may be fatal to SGX, the rest of the kernel
+is unlikely to be impacted and should continue to work.
+
+As a result, when this happpens, user should stop running any new
+SGX workloads, (or just any new workloads), and migrate all valuable
+workloads. Although a machine reboot can recover all EPC memory, the bug
+should be reported to Linux developers.
+
+
+Virtual EPC
+===========
+
+The implementation has also a virtual EPC driver to support SGX enclaves
+in guests. Unlike the SGX driver, an EPC page allocated by the virtual
+EPC driver doesn't have a specific enclave associated with it. This is
+because KVM doesn't track how a guest uses EPC pages.
+
+As a result, the SGX core page reclaimer doesn't support reclaiming EPC
+pages allocated to KVM guests through the virtual EPC driver. If the
+user wants to deploy SGX applications both on the host and in guests
+on the same machine, the user should reserve enough EPC (by taking out
+total virtual EPC size of all SGX VMs from the physical EPC size) for
+host SGX applications so they can run with acceptable performance.
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/sgx
F: Documentation/x86/sgx.rst
F: arch/x86/entry/vdso/vsgx.S
+F: arch/x86/include/asm/sgx.h
F: arch/x86/include/uapi/asm/sgx.h
F: arch/x86/kernel/cpu/sgx/*
F: tools/testing/selftests/sgx/*
#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */
#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_USE_HW | \
+ KVM_GUESTDBG_SINGLESTEP)
/*
* When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
* take the following values:
struct kvm_vcpu_events *events);
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
r = 1;
break;
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
kvm_flush_remote_tlbs(kvm);
}
return -EINVAL;
}
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
- KVM_GUESTDBG_USE_SW_BP | \
- KVM_GUESTDBG_USE_HW | \
- KVM_GUESTDBG_SINGLESTEP)
-
/**
* kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
* @kvm: pointer to the KVM struct
* gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
* the page we just got a reference to gets unmapped before we have a
* chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_hva will take it away
+ * unmapped afterwards, the call to kvm_unmap_gfn will take it away
* from us again properly. This smp_rmb() interacts with the smp_wmb()
* in kvm_mmu_notifier_invalidate_<page|range_end>.
*/
return ret;
}
-static int handle_hva_to_gpa(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- int (*handler)(struct kvm *kvm,
- gpa_t gpa, u64 size,
- void *data),
- void *data)
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int ret = 0;
-
- slots = kvm_memslots(kvm);
-
- /* we only care about the pages that the guest sees */
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gpa;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
-
- gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
- ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
- }
-
- return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- unsigned flags = *(unsigned *)data;
- bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
-
- __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
- return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_unmap_hva_range(start, end);
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
- return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- kvm_pfn_t *pfn = (kvm_pfn_t *)data;
-
- WARN_ON(size != PAGE_SIZE);
+ __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
- /*
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
- __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- unsigned long end = hva + PAGE_SIZE;
- kvm_pfn_t pfn = pte_pfn(pte);
+ kvm_pfn_t pfn = pte_pfn(range->pte);
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_set_spte_hva(hva);
+ WARN_ON(range->end - range->start != 1);
/*
* We've moved a page around, probably through CoW, so let's treat it
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
+
+ /*
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
+ */
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
+ PAGE_SIZE, __pfn_to_phys(pfn),
+ KVM_PGTABLE_PROT_R, NULL);
+
return 0;
}
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- pte_t pte;
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
kvm_pte_t kpte;
+ pte_t pte;
+
+ if (!kvm->arch.mmu.pgt)
+ return 0;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT);
pte = __pte(kpte);
return pte_valid(pte) && pte_young(pte);
}
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_age_hva(start, end);
- return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- if (!kvm->arch.mmu.pgt)
- return 0;
- trace_kvm_test_age_hva(hva);
- return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
- kvm_test_age_hva_handler, NULL);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT);
}
phys_addr_t kvm_mmu_get_httbr(void)
__entry->vcpu_pc, __entry->instr, __entry->cpsr)
);
-TRACE_EVENT(kvm_unmap_hva_range,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field( unsigned long, start )
- __field( unsigned long, end )
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
- __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field( unsigned long, start )
- __field( unsigned long, end )
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
- __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
TRACE_EVENT(kvm_set_way_flush,
TP_PROTO(unsigned long vcpu_pc, bool cache),
TP_ARGS(vcpu_pc, cache),
int (*vcpu_init)(struct kvm_vcpu *vcpu);
void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
int (*vcpu_setup)(struct kvm_vcpu *vcpu);
- void (*flush_shadow_all)(struct kvm *kvm);
- /*
- * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
- * VZ root TLB, or T&E GVA page tables and corresponding root TLB
- * mappings).
- */
- void (*flush_shadow_memslot)(struct kvm *kvm,
- const struct kvm_memory_slot *slot);
+ void (*prepare_flush_shadow)(struct kvm *kvm);
gpa_t (*gva_to_gpa)(gva_t gva);
void (*queue_timer_int)(struct kvm_vcpu *vcpu);
void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
bool write);
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
/* Emulation */
int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+
#endif /* __MIPS_KVM_HOST_H__ */
{
/* Flush whole GPA */
kvm_mips_flush_gpa_pt(kvm, 0, ~0);
-
- /* Let implementation do the rest */
- kvm_mips_callbacks->flush_shadow_all(kvm);
+ kvm_flush_remote_tlbs(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
/* Flush slot from GPA */
kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
slot->base_gfn + slot->npages - 1);
- /* Let implementation do the rest */
- kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
spin_unlock(&kvm->mmu_lock);
}
/* Write protect GPA page table entries */
needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
new->base_gfn + new->npages - 1);
- /* Let implementation do the rest */
if (needs_flush)
- kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+ kvm_arch_flush_remote_tlbs_memslot(kvm, new);
spin_unlock(&kvm->mmu_lock);
}
}
}
+int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+ kvm_mips_callbacks->prepare_flush_shadow(kvm);
+ return 1;
+}
+
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
- /* Let implementation handle TLB/GVA invalidation */
- kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+ kvm_flush_remote_tlbs(kvm);
}
long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
end_gfn << PAGE_SHIFT);
}
-static int handle_hva_to_gpa(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- int (*handler)(struct kvm *kvm, gfn_t gfn,
- gpa_t gfn_end,
- struct kvm_memory_slot *memslot,
- void *data),
- void *data)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int ret = 0;
-
- slots = kvm_memslots(kvm);
-
- /* we only care about the pages that the guest sees */
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
-
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- ret |= handler(kvm, gfn, gfn_end, memslot, data);
- }
-
- return ret;
-}
-
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
-{
- kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+ kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
return 1;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
-{
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-
- kvm_mips_callbacks->flush_shadow_all(kvm);
- return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- gpa_t gpa = gfn << PAGE_SHIFT;
- pte_t hva_pte = *(pte_t *)data;
+ gpa_t gpa = range->start << PAGE_SHIFT;
+ pte_t hva_pte = range->pte;
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
pte_t old_pte;
if (!gpa_pte)
- return 0;
+ return false;
/* Mapping may need adjusting depending on memslot flags */
old_pte = *gpa_pte;
- if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+ if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
hva_pte = pte_mkclean(hva_pte);
- else if (memslot->flags & KVM_MEM_READONLY)
+ else if (range->slot->flags & KVM_MEM_READONLY)
hva_pte = pte_wrprotect(hva_pte);
set_pte(gpa_pte, hva_pte);
/* Replacing an absent or old page doesn't need flushes */
if (!pte_present(old_pte) || !pte_young(old_pte))
- return 0;
+ return false;
/* Pages swapped, aged, moved, or cleaned require flushes */
return !pte_present(hva_pte) ||
(pte_dirty(old_pte) && !pte_dirty(hva_pte));
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
- unsigned long end = hva + PAGE_SIZE;
- int ret;
-
- ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
- if (ret)
- kvm_mips_callbacks->flush_shadow_all(kvm);
- return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
+ return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
}
-static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- gpa_t gpa = gfn << PAGE_SHIFT;
+ gpa_t gpa = range->start << PAGE_SHIFT;
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
if (!gpa_pte)
return pte_young(*gpa_pte);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
- return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
/**
* _kvm_mips_map_page_fast() - Fast path GPA fault handler.
* @vcpu: VCPU pointer.
return 0;
}
-static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
+static void kvm_trap_emul_prepare_flush_shadow(struct kvm *kvm)
{
- /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
- kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
-{
- kvm_trap_emul_flush_shadow_all(kvm);
}
static u64 kvm_trap_emul_get_one_regs[] = {
.vcpu_init = kvm_trap_emul_vcpu_init,
.vcpu_uninit = kvm_trap_emul_vcpu_uninit,
.vcpu_setup = kvm_trap_emul_vcpu_setup,
- .flush_shadow_all = kvm_trap_emul_flush_shadow_all,
- .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
+ .prepare_flush_shadow = kvm_trap_emul_prepare_flush_shadow,
.gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
.queue_timer_int = kvm_mips_queue_timer_int_cb,
.dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
return 0;
}
-static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+static void kvm_vz_prepare_flush_shadow(struct kvm *kvm)
{
- if (cpu_has_guestid) {
- /* Flush GuestID for each VCPU individually */
- kvm_flush_remote_tlbs(kvm);
- } else {
+ if (!cpu_has_guestid) {
/*
* For each CPU there is a single GPA ASID used by all VCPUs in
* the VM, so it doesn't make sense for the VCPUs to handle
* invalidation of these ASIDs individually.
*
* Instead mark all CPUs as needing ASID invalidation in
- * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+ * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will
* kick any running VCPUs so they check asid_flush_mask.
*/
cpumask_setall(&kvm->arch.asid_flush_mask);
- kvm_flush_remote_tlbs(kvm);
}
}
-static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
-{
- kvm_vz_flush_shadow_all(kvm);
-}
-
static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
.vcpu_init = kvm_vz_vcpu_init,
.vcpu_uninit = kvm_vz_vcpu_uninit,
.vcpu_setup = kvm_vz_vcpu_setup,
- .flush_shadow_all = kvm_vz_flush_shadow_all,
- .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+ .prepare_flush_shadow = kvm_vz_prepare_flush_shadow,
.gva_to_gpa = kvm_vz_gva_to_gpa_cb,
.queue_timer_int = kvm_vz_queue_timer_int_cb,
.dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
unsigned int lpid);
extern int kvmppc_radix_init(void);
extern void kvmppc_radix_exit(void);
-extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
+extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
#define KVM_ARCH_WANT_MMU_NOTIFIER
-extern int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
#define HPTEG_CACHE_NUM (1 << 15)
#define HPTEG_HASH_BITS_PTE 13
#define HPTEG_HASH_BITS_PTE_LONG 12
const struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change);
- int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
- unsigned long end);
- int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
- int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
- void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+ bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
void (*free_memslot)(struct kvm_memory_slot *slot);
int (*init_vm)(struct kvm *kvm);
void (*destroy_vm)(struct kvm *kvm);
kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+ return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+ return kvm->arch.kvm_ops->age_gfn(kvm, range);
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+ return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
- return 0;
+ return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
}
int kvmppc_core_init_vm(struct kvm *kvm)
extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
- unsigned long end);
-extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
- unsigned long end);
-extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
-typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-
-static int kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- hva_handler_fn handler)
-{
- int ret;
- int retval = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- slots = kvm_memslots(kvm);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn, gfn+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for (; gfn < gfn_end; ++gfn) {
- ret = handler(kvm, memslot, gfn);
- retval |= ret;
- }
- }
-
- return retval;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- hva_handler_fn handler)
-{
- return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
-}
-
/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
struct kvm_memory_slot *memslot,
}
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
unsigned long i;
__be64 *hptep;
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
- return 0;
+ return false;
}
-int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ return kvm_unmap_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva_range(kvm, start, end, handler);
- return 0;
+ return kvm_unmap_rmapp(kvm, range->slot, range->start);
}
void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
}
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
return ret;
}
-int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ kvm_age_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
- return kvm_handle_hva_range(kvm, start, end, handler);
+ return kvm_age_rmapp(kvm, range->slot, range->start);
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long *hp;
- int ret = 1;
+ bool ret = true;
unsigned long *rmapp;
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
if (*rmapp & KVMPPC_RMAP_REFERENCED)
- return 1;
+ return true;
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_REFERENCED)
goto out;
} while ((i = j) != head);
}
- ret = 0;
+ ret = false;
out:
unlock_rmap(rmapp);
return ret;
}
-int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ kvm_test_age_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
- return kvm_handle_hva(kvm, hva, handler);
+ return kvm_test_age_rmapp(kvm, range->slot, range->start);
}
-void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ return kvm_unmap_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva(kvm, hva, handler);
+ return kvm_unmap_rmapp(kvm, range->slot, range->start);
}
static int vcpus_running(struct kvm *kvm)
}
/* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
- return 0;
+ return false;
}
ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
- return 0;
+ return false;
}
/* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- int ref = 0;
+ bool ref = false;
unsigned long old, *rmapp;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
old & PTE_RPN_MASK,
1UL << shift);
- ref = 1;
+ ref = true;
}
return ref;
}
/* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
+
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- int ref = 0;
+ bool ref = false;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep))
- ref = 1;
+ ref = true;
return ref;
}
kvmhv_release_all_nested(kvm);
kvmppc_rmap_reset(kvm);
kvm->arch.process_table = 0;
- /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 0;
spin_unlock(&kvm->mmu_lock);
if (err)
return err;
kvmppc_rmap_reset(kvm);
- /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 1;
spin_unlock(&kvm->mmu_lock);
.flush_memslot = kvmppc_core_flush_memslot_hv,
.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
.commit_memory_region = kvmppc_core_commit_memory_region_hv,
- .unmap_hva_range = kvm_unmap_hva_range_hv,
- .age_hva = kvm_age_hva_hv,
- .test_age_hva = kvm_test_age_hva_hv,
- .set_spte_hva = kvm_set_spte_hva_hv,
+ .unmap_gfn_range = kvm_unmap_gfn_range_hv,
+ .age_gfn = kvm_age_gfn_hv,
+ .test_age_gfn = kvm_test_age_gfn_hv,
+ .set_spte_gfn = kvm_set_spte_gfn_hv,
.free_memslot = kvmppc_core_free_memslot_hv,
.init_vm = kvmppc_core_init_vm_hv,
.destroy_vm = kvmppc_core_destroy_vm_hv,
}
/************* MMU Notifiers *************/
-static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
long i;
struct kvm_vcpu *vcpu;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- slots = kvm_memslots(kvm);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+ range->end << PAGE_SHIFT);
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn, gfn+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
- gfn_end << PAGE_SHIFT);
- }
+ return false;
}
-static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
- do_kvm_unmap_hva(kvm, start, end);
-
- return 0;
+ return do_kvm_unmap_gfn(kvm, range);
}
-static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* The page will get remapped properly on its next fault */
- do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+ return do_kvm_unmap_gfn(kvm, range);
}
/*****************************************/
.flush_memslot = kvmppc_core_flush_memslot_pr,
.prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
.commit_memory_region = kvmppc_core_commit_memory_region_pr,
- .unmap_hva_range = kvm_unmap_hva_range_pr,
- .age_hva = kvm_age_hva_pr,
- .test_age_hva = kvm_test_age_hva_pr,
- .set_spte_hva = kvm_set_spte_hva_pr,
+ .unmap_gfn_range = kvm_unmap_gfn_range_pr,
+ .age_gfn = kvm_age_gfn_pr,
+ .test_age_gfn = kvm_test_age_gfn_pr,
+ .set_spte_gfn = kvm_set_spte_gfn_pr,
.free_memslot = kvmppc_core_free_memslot_pr,
.init_vm = kvmppc_core_init_vm_pr,
.destroy_vm = kvmppc_core_destroy_vm_pr,
/************* MMU Notifiers *************/
-static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- trace_kvm_unmap_hva(hva);
-
/*
* Flush all shadow tlb entries everywhere. This is slow, but
* we are 100% sure that we catch the to be unmapped page
*/
- kvm_flush_remote_tlbs(kvm);
-
- return 0;
+ return true;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- /* kvm_unmap_hva flushes everything anyways */
- kvm_unmap_hva(kvm, start);
-
- return 0;
+ return kvm_e500_mmu_unmap_gfn(kvm, range);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* The page will get remapped properly on its next fault */
- kvm_unmap_hva(kvm, hva);
- return 0;
+ return kvm_e500_mmu_unmap_gfn(kvm, range);
}
/*****************************************/
)
);
-TRACE_EVENT(kvm_unmap_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
TRACE_EVENT(kvm_booke206_stlb_write,
TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
u64 diagnose_44;
u64 diagnose_9c;
u64 diagnose_9c_ignored;
+ u64 diagnose_9c_forward;
u64 diagnose_258;
u64 diagnose_308;
u64 diagnose_500;
#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
(vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+ KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
struct kvm_guestdbg_info_arch {
unsigned long cr0;
unsigned long cr9;
extern void __cpu_die(unsigned int cpu);
extern int __cpu_disable(void);
extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
#endif /* __ASM_SMP_H */
asm volatile("diag %0,0,0x9c"
: : "d" (pcpu_devices[cpu].address));
}
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the
return 0;
}
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+ /* Reset the count on a new slice */
+ if (time_after(jiffies, cur_slice)) {
+ cur_slice = jiffies;
+ forward_cnt = diag9c_forwarding_hz / HZ;
+ }
+ return forward_cnt-- <= 0 ? 1 : 0;
+}
+
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
if (!tcpu)
goto no_yield;
- /* target already running */
- if (READ_ONCE(tcpu->cpu) >= 0)
- goto no_yield;
+ /* target guest VCPU already running */
+ if (READ_ONCE(tcpu->cpu) >= 0) {
+ if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+ goto no_yield;
+
+ /* target host CPU already running */
+ if (!vcpu_is_preempted(tcpu->cpu))
+ goto no_yield;
+ smp_yield_cpu(tcpu->cpu);
+ VCPU_EVENT(vcpu, 5,
+ "diag time slice end directed to %d: yield forwarded",
+ tid);
+ vcpu->stat.diagnose_9c_forward++;
+ return 0;
+ }
if (kvm_vcpu_yield_to(tcpu) <= 0)
goto no_yield;
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ * successful (return value 0), or to the first invalid DAT entry in
+ * case of exceptions (return value > 0)
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
+ *pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
rste.val = ptr;
goto shadow_r3t;
}
+ *pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
rtte.val = ptr;
goto shadow_sgt;
}
+ *pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
ste.val = ptr;
goto shadow_pgt;
}
+ *pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr)
+ unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
- unsigned long pgt;
+ unsigned long pgt = 0;
int dat_protection, fake;
int rc;
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
- if (!rc)
- rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+ switch (rc) {
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_FIRST_TRANS:
+ pgt |= PEI_NOT_PTE;
+ break;
+ case 0:
+ pgt += vaddr.px * 8;
+ rc = gmap_read_table(sg->parent, pgt, &pte.val);
+ }
+ if (datptr)
+ *datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
* @gra - guest real address
*
* Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra of by applying the given prefix.
*/
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
- unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
{
- unsigned long prefix = kvm_s390_get_prefix(vcpu);
-
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
return gra;
}
+/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+ unsigned long gra)
+{
+ return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extendended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+ unsigned long ga)
+{
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+ return ga;
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+ return ga & ((1UL << 31) - 1);
+ return ga & ((1UL << 24) - 1);
+}
+
/**
* kvm_s390_logical_to_effective - convert guest logical to effective address
* @vcpu: guest virtual cpu
static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
unsigned long ga)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
- return ga;
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
- return ga & ((1UL << 31) - 1);
- return ga & ((1UL << 24) - 1);
+ return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
}
/*
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr);
+ unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */
VCPU_STAT("instruction_diag_44", diagnose_44),
VCPU_STAT("instruction_diag_9c", diagnose_9c),
VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+ VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
VCPU_STAT("instruction_diag_258", diagnose_258),
VCPU_STAT("instruction_diag_308", diagnose_308),
VCPU_STAT("instruction_diag_500", diagnose_500),
module_param(use_gisa, bool, 0644);
MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
case KVM_CAP_S390_DIAG318:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ r = KVM_GUESTDBG_VALID_MASK;
+ break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
if (hpage && !kvm_is_ucontrol(kvm))
kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
if (MACHINE_HAS_GS) {
+ preempt_disable();
__ctl_set_bit(2, 4);
if (vcpu->arch.gs_enabled)
save_gs_cb(current->thread.gs_cb);
- preempt_disable();
current->thread.gs_cb = vcpu->arch.host_gscb;
restore_gs_cb(vcpu->arch.host_gscb);
- preempt_enable();
if (!vcpu->arch.host_gscb)
__ctl_clear_bit(2, 4);
vcpu->arch.host_gscb = NULL;
+ preempt_enable();
}
/* SIE will save etoken directly into SDNX and therefore kvm_run */
}
/*
* As we are starting a second VCPU, we have to disable
* the IBS facility on all VCPUs to remove potentially
- * oustanding ENABLE requests.
+ * outstanding ENABLE requests.
*/
__disable_ibs_on_all_vcpus(vcpu->kvm);
}
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Set the maximum number of diag9c forwarding per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
#endif
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
- case ICPT_PARTEXEC:
- /* MVPG only */
- memcpy((void *)((u64)scb_o + 0xc0),
- (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
- break;
}
if (scb_s->ihcpu != 0xffffU)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE);
+ prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr);
+ current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr);
+ vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
return 0;
}
+/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+ /* no need to validate the parameter and/or perform error handling */
+ reg &= 0xf;
+ switch (reg) {
+ case 15:
+ return vsie_page->scb_s.gg15;
+ case 14:
+ return vsie_page->scb_s.gg14;
+ default:
+ return vcpu->run->s.regs.gprs[reg];
+ }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ u64 *pei_block = &vsie_page->scb_o->mcic;
+ int edat, rc_dest, rc_src;
+ union ctlreg0 cr0;
+
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+ prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+ dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+ dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+ src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+ src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+ rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+ rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ /*
+ * Either everything went well, or something non-critical went wrong
+ * e.g. because of a race. In either case, simply retry.
+ */
+ if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+ retry_vsie_icpt(vsie_page);
+ return -EAGAIN;
+ }
+ /* Something more serious went wrong, propagate the error */
+ if (rc_dest < 0)
+ return rc_dest;
+ if (rc_src < 0)
+ return rc_src;
+
+ /* The only possible suppressing exception: just deliver it */
+ if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+ clear_vsie_icpt(vsie_page);
+ rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+ WARN_ON_ONCE(rc_dest);
+ return 1;
+ }
+
+ /*
+ * Forward the PEI intercept to the guest if it was a page fault, or
+ * also for segment and region table faults if EDAT applies.
+ */
+ if (edat) {
+ rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+ rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+ } else {
+ rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+ rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+ }
+ if (!rc_dest && !rc_src) {
+ pei_block[0] = pei_dest;
+ pei_block[1] = pei_src;
+ return 1;
+ }
+
+ retry_vsie_icpt(vsie_page);
+
+ /*
+ * The host has edat, and the guest does not, or it was an ASCE type
+ * exception. The host needs to inject the appropriate DAT interrupts
+ * into the guest.
+ */
+ if (rc_dest)
+ return inject_fault(vcpu, rc_dest, dest, 1);
+ return inject_fault(vcpu, rc_src, src, 0);
+}
+
/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
+ case ICPT_PARTEXEC:
+ if (scb_s->ipa == 0xb254)
+ rc = vsie_handle_mvpg(vcpu, vsie_page);
+ break;
}
return rc;
}
depends on CRYPTO_SHA256=y
select SRCU
select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_SINGLESTEP | \
+ KVM_GUESTDBG_USE_HW_BP | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_INJECT_BP | \
+ KVM_GUESTDBG_INJECT_DB)
+
+
#define PFERR_PRESENT_BIT 0
#define PFERR_WRITE_BIT 1
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
#define PFERR_GUEST_FINAL_BIT 32
#define PFERR_GUEST_PAGE_BIT 33
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
u32 user_space_msr_mask;
struct kvm_x86_msr_filter __rcu *msr_filter;
+ /* Guest can access the SGX PROVISIONKEY. */
+ bool sgx_provisioning_allowed;
+
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
bool tdp_mmu_enabled;
/*
- * List of struct kvmp_mmu_pages being used as roots.
+ * List of struct kvm_mmu_pages being used as roots.
* All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
- * All struct kvm_mmu_pages in the list should have a positive
- * root_count except when a thread holds the MMU lock and is removing
- * an entry from the list.
+ *
+ * For reads, this list is protected by:
+ * the MMU lock in read mode + RCU or
+ * the MMU lock in write mode
+ *
+ * For writes, this list is protected by:
+ * the MMU lock in read mode + the tdp_mmu_pages_lock or
+ * the MMU lock in write mode
+ *
+ * Roots will remain in the list until their tdp_mmu_root_count
+ * drops to zero, at which point the thread that decremented the
+ * count to zero should removed the root from the list and clean
+ * it up, freeing the root after an RCU grace period.
*/
struct list_head tdp_mmu_roots;
/*
* List of struct kvmp_mmu_pages not being used as roots.
* All struct kvm_mmu_pages in the list should have
- * tdp_mmu_page set and a root_count of 0.
+ * tdp_mmu_page set and a tdp_mmu_root_count of 0.
*/
struct list_head tdp_mmu_pages;
/*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
+ * - tdp_mmu_roots (above)
* - tdp_mmu_pages (above)
* - the link field of struct kvm_mmu_pages used by the TDP MMU
* - lpage_disallowed_mmu_pages
u64 req_event;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
+ u64 nested_run;
+ u64 directed_yield_attempted;
+ u64 directed_yield_successful;
};
struct x86_instruction_info;
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level);
+ void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level);
bool (*has_wbinvd_exit)(void);
int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
struct kvm_x86_nested_ops {
int (*check_events)(struct kvm_vcpu *vcpu);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+ void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
unsigned user_data_size);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
_ASM_EXTABLE(666b, 667b)
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Intel Software Guard Extensions (SGX) support.
+ */
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+/*
+ * This file contains both data structures defined by SGX architecture and Linux
+ * defined software data structures and functions. The two should not be mixed
+ * together for better readibility. The architectural definitions come first.
+ */
+
+/* The SGX specific CPUID function. */
+#define SGX_CPUID 0x12
+/* EPC enumeration. */
+#define SGX_CPUID_EPC 2
+/* An invalid EPC section, i.e. the end marker. */
+#define SGX_CPUID_EPC_INVALID 0x0
+/* A valid EPC section. */
+#define SGX_CPUID_EPC_SECTION 0x1
+/* The bitmask for the EPC section type. */
+#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+};
+
+/**
+ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
+ * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
+ * been completed yet.
+ * %SGX_CHILD_PRESENT SECS has child pages present in the EPC.
+ * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
+ * public key does not match IA32_SGXLEPUBKEYHASH.
+ * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
+ */
+enum sgx_return_code {
+ SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
+ SGX_INVALID_EINITTOKEN = 16,
+ SGX_UNMASKED_EVENT = 128,
+};
+
+/* The modulus size for 3072-bit RSA keys. */
+#define SGX_MODULUS_SIZE 384
+
+/**
+ * enum sgx_miscselect - additional information to an SSA frame
+ * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
+ *
+ * Save State Area (SSA) is a stack inside the enclave used to store processor
+ * state when an exception or interrupt occurs. This enum defines additional
+ * information stored to an SSA frame.
+ */
+enum sgx_miscselect {
+ SGX_MISC_EXINFO = BIT(0),
+};
+
+#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
+
+#define SGX_SSA_GPRS_SIZE 184
+#define SGX_SSA_MISC_EXINFO_SIZE 16
+
+/**
+ * enum sgx_attributes - the attributes field in &struct sgx_secs
+ * %SGX_ATTR_INIT: Enclave can be entered (is initialized).
+ * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave.
+ * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
+ * attestation.
+ * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
+ * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
+ * sign cryptographic tokens that can be passed to
+ * EINIT as an authorization to run an enclave.
+ */
+enum sgx_attribute {
+ SGX_ATTR_INIT = BIT(0),
+ SGX_ATTR_DEBUG = BIT(1),
+ SGX_ATTR_MODE64BIT = BIT(2),
+ SGX_ATTR_PROVISIONKEY = BIT(4),
+ SGX_ATTR_EINITTOKENKEY = BIT(5),
+ SGX_ATTR_KSS = BIT(7),
+};
+
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
+
+/**
+ * struct sgx_secs - SGX Enclave Control Structure (SECS)
+ * @size: size of the address space
+ * @base: base address of the address space
+ * @ssa_frame_size: size of an SSA frame
+ * @miscselect: additional information stored to an SSA frame
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT
+ * @config_id: a user-defined value that is used in key derivation
+ * @isv_prod_id: a user-defined value that is used in key derivation
+ * @isv_svn: a user-defined value that is used in key derivation
+ * @config_svn: a user-defined value that is used in key derivation
+ *
+ * SGX Enclave Control Structure (SECS) is a special enclave page that is not
+ * visible in the address space. In fact, this structure defines the address
+ * range and other global attributes for the enclave and it is the first EPC
+ * page created for any enclave. It is moved from a temporary buffer to an EPC
+ * by the means of ENCLS[ECREATE] function.
+ */
+struct sgx_secs {
+ u64 size;
+ u64 base;
+ u32 ssa_frame_size;
+ u32 miscselect;
+ u8 reserved1[24];
+ u64 attributes;
+ u64 xfrm;
+ u32 mrenclave[8];
+ u8 reserved2[32];
+ u32 mrsigner[8];
+ u8 reserved3[32];
+ u32 config_id[16];
+ u16 isv_prod_id;
+ u16 isv_svn;
+ u16 config_svn;
+ u8 reserved4[3834];
+} __packed;
+
+/**
+ * enum sgx_tcs_flags - execution flags for TCS
+ * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
+ * inside an enclave. It is cleared by EADD but can
+ * be set later with EDBGWR.
+ */
+enum sgx_tcs_flags {
+ SGX_TCS_DBGOPTIN = 0x01,
+};
+
+#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1)
+#define SGX_TCS_RESERVED_SIZE 4024
+
+/**
+ * struct sgx_tcs - Thread Control Structure (TCS)
+ * @state: used to mark an entered TCS
+ * @flags: execution flags (cleared by EADD)
+ * @ssa_offset: SSA stack offset relative to the enclave base
+ * @ssa_index: the current SSA frame index (cleard by EADD)
+ * @nr_ssa_frames: the number of frame in the SSA stack
+ * @entry_offset: entry point offset relative to the enclave base
+ * @exit_addr: address outside the enclave to exit on an exception or
+ * interrupt
+ * @fs_offset: offset relative to the enclave base to become FS
+ * segment inside the enclave
+ * @gs_offset: offset relative to the enclave base to become GS
+ * segment inside the enclave
+ * @fs_limit: size to become a new FS-limit (only 32-bit enclaves)
+ * @gs_limit: size to become a new GS-limit (only 32-bit enclaves)
+ *
+ * Thread Control Structure (TCS) is an enclave page visible in its address
+ * space that defines an entry point inside the enclave. A thread enters inside
+ * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered
+ * by only one thread at a time.
+ */
+struct sgx_tcs {
+ u64 state;
+ u64 flags;
+ u64 ssa_offset;
+ u32 ssa_index;
+ u32 nr_ssa_frames;
+ u64 entry_offset;
+ u64 exit_addr;
+ u64 fs_offset;
+ u64 gs_offset;
+ u32 fs_limit;
+ u32 gs_limit;
+ u8 reserved[SGX_TCS_RESERVED_SIZE];
+} __packed;
+
+/**
+ * struct sgx_pageinfo - an enclave page descriptor
+ * @addr: address of the enclave page
+ * @contents: pointer to the page contents
+ * @metadata: pointer either to a SECINFO or PCMD instance
+ * @secs: address of the SECS page
+ */
+struct sgx_pageinfo {
+ u64 addr;
+ u64 contents;
+ u64 metadata;
+ u64 secs;
+} __packed __aligned(32);
+
+
+/**
+ * enum sgx_page_type - bits in the SECINFO flags defining the page type
+ * %SGX_PAGE_TYPE_SECS: a SECS page
+ * %SGX_PAGE_TYPE_TCS: a TCS page
+ * %SGX_PAGE_TYPE_REG: a regular page
+ * %SGX_PAGE_TYPE_VA: a VA page
+ * %SGX_PAGE_TYPE_TRIM: a page in trimmed state
+ */
+enum sgx_page_type {
+ SGX_PAGE_TYPE_SECS,
+ SGX_PAGE_TYPE_TCS,
+ SGX_PAGE_TYPE_REG,
+ SGX_PAGE_TYPE_VA,
+ SGX_PAGE_TYPE_TRIM,
+};
+
+#define SGX_NR_PAGE_TYPES 5
+#define SGX_PAGE_TYPE_MASK GENMASK(7, 0)
+
+/**
+ * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
+ * %SGX_SECINFO_R: allow read
+ * %SGX_SECINFO_W: allow write
+ * %SGX_SECINFO_X: allow execution
+ * %SGX_SECINFO_SECS: a SECS page
+ * %SGX_SECINFO_TCS: a TCS page
+ * %SGX_SECINFO_REG: a regular page
+ * %SGX_SECINFO_VA: a VA page
+ * %SGX_SECINFO_TRIM: a page in trimmed state
+ */
+enum sgx_secinfo_flags {
+ SGX_SECINFO_R = BIT(0),
+ SGX_SECINFO_W = BIT(1),
+ SGX_SECINFO_X = BIT(2),
+ SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8),
+ SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8),
+ SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8),
+ SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8),
+ SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8),
+};
+
+#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0)
+#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8)
+#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \
+ SGX_SECINFO_PAGE_TYPE_MASK)
+
+/**
+ * struct sgx_secinfo - describes attributes of an EPC page
+ * @flags: permissions and type
+ *
+ * Used together with ENCLS leaves that add or modify an EPC page to an
+ * enclave to define page permissions and type.
+ */
+struct sgx_secinfo {
+ u64 flags;
+ u8 reserved[56];
+} __packed __aligned(64);
+
+#define SGX_PCMD_RESERVED_SIZE 40
+
+/**
+ * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
+ * @enclave_id: enclave identifier
+ * @mac: MAC over PCMD, page contents and isvsvn
+ *
+ * PCMD is stored for every swapped page to the regular memory. When ELDU loads
+ * the page back it recalculates the MAC by using a isvsvn number stored in a
+ * VA page. Together these two structures bring integrity and rollback
+ * protection.
+ */
+struct sgx_pcmd {
+ struct sgx_secinfo secinfo;
+ u64 enclave_id;
+ u8 reserved[SGX_PCMD_RESERVED_SIZE];
+ u8 mac[16];
+} __packed __aligned(128);
+
+#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
+#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
+#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
+#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
+
+/**
+ * struct sgx_sigstruct_header - defines author of the enclave
+ * @header1: constant byte string
+ * @vendor: must be either 0x0000 or 0x8086
+ * @date: YYYYMMDD in BCD
+ * @header2: costant byte string
+ * @swdefined: software defined value
+ */
+struct sgx_sigstruct_header {
+ u64 header1[2];
+ u32 vendor;
+ u32 date;
+ u64 header2[2];
+ u32 swdefined;
+ u8 reserved1[84];
+} __packed;
+
+/**
+ * struct sgx_sigstruct_body - defines contents of the enclave
+ * @miscselect: additional information stored to an SSA frame
+ * @misc_mask: required miscselect in SECS
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @attributes_mask: required attributes in SECS
+ * @xfrm_mask: required XFRM in SECS
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @isvprodid: a user-defined value that is used in key derivation
+ * @isvsvn: a user-defined value that is used in key derivation
+ */
+struct sgx_sigstruct_body {
+ u32 miscselect;
+ u32 misc_mask;
+ u8 reserved2[20];
+ u64 attributes;
+ u64 xfrm;
+ u64 attributes_mask;
+ u64 xfrm_mask;
+ u8 mrenclave[32];
+ u8 reserved3[32];
+ u16 isvprodid;
+ u16 isvsvn;
+} __packed;
+
+/**
+ * struct sgx_sigstruct - an enclave signature
+ * @header: defines author of the enclave
+ * @modulus: the modulus of the public key
+ * @exponent: the exponent of the public key
+ * @signature: the signature calculated over the fields except modulus,
+ * @body: defines contents of the enclave
+ * @q1: a value used in RSA signature verification
+ * @q2: a value used in RSA signature verification
+ *
+ * Header and body are the parts that are actual signed. The remaining fields
+ * define the signature of the enclave.
+ */
+struct sgx_sigstruct {
+ struct sgx_sigstruct_header header;
+ u8 modulus[SGX_MODULUS_SIZE];
+ u32 exponent;
+ u8 signature[SGX_MODULUS_SIZE];
+ struct sgx_sigstruct_body body;
+ u8 reserved4[12];
+ u8 q1[SGX_MODULUS_SIZE];
+ u8 q2[SGX_MODULUS_SIZE];
+} __packed;
+
+#define SGX_LAUNCH_TOKEN_SIZE 304
+
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
* SEV-ES guests when referenced through the GHCB or for
* saving to the host save area.
*/
- u8 reserved_7[80];
+ u8 reserved_7[72];
+ u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+ u8 reserved_7b[4];
u32 pkru;
u8 reserved_7a[20];
u64 reserved_8; /* rax already available at 0x01f8 */
#define GUEST_INTR_STATE_MOV_SS 0x00000002
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010
/* GUEST_ACTIVITY_STATE flags */
#define GUEST_ACTIVITY_ACTIVE 0
#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
{}
};
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
-static void clear_sgx_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_SGX);
- setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
static int __init nosgx(char *str)
{
- clear_sgx_caps();
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
return 0;
}
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
bool tboot = tboot_enabled();
- bool enable_sgx;
+ bool enable_vmx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
- clear_sgx_caps();
+ clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
- /*
- * Enable SGX if and only if the kernel supports SGX and Launch Control
- * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
- */
- enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
- cpu_has(c, X86_FEATURE_SGX_LC) &&
- IS_ENABLED(CONFIG_X86_SGX);
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
* i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
* for the kernel, e.g. using VMX to hide malicious code.
*/
- if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ if (enable_vmx) {
msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
if (tboot)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
- if (enable_sgx)
- msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
wrmsrl(MSR_IA32_FEAT_CTL, msr);
}
update_sgx:
- if (!(msr & FEAT_CTL_SGX_ENABLED) ||
- !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
- if (enable_sgx)
- pr_err_once("SGX disabled by BIOS\n");
- clear_sgx_caps();
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
}
}
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
encl.o \
ioctl.o \
main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-/**
- * Copyright(c) 2016-20 Intel Corporation.
- *
- * Contains data structures defined by the SGX architecture. Data structures
- * defined by the Linux software stack should not be placed here.
- */
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
-
-#include <linux/bits.h>
-#include <linux/types.h>
-
-/* The SGX specific CPUID function. */
-#define SGX_CPUID 0x12
-/* EPC enumeration. */
-#define SGX_CPUID_EPC 2
-/* An invalid EPC section, i.e. the end marker. */
-#define SGX_CPUID_EPC_INVALID 0x0
-/* A valid EPC section. */
-#define SGX_CPUID_EPC_SECTION 0x1
-/* The bitmask for the EPC section type. */
-#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
-
-/**
- * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
- * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
- * been completed yet.
- * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
- * public key does not match IA32_SGXLEPUBKEYHASH.
- * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
- */
-enum sgx_return_code {
- SGX_NOT_TRACKED = 11,
- SGX_INVALID_EINITTOKEN = 16,
- SGX_UNMASKED_EVENT = 128,
-};
-
-/* The modulus size for 3072-bit RSA keys. */
-#define SGX_MODULUS_SIZE 384
-
-/**
- * enum sgx_miscselect - additional information to an SSA frame
- * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
- *
- * Save State Area (SSA) is a stack inside the enclave used to store processor
- * state when an exception or interrupt occurs. This enum defines additional
- * information stored to an SSA frame.
- */
-enum sgx_miscselect {
- SGX_MISC_EXINFO = BIT(0),
-};
-
-#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
-
-#define SGX_SSA_GPRS_SIZE 184
-#define SGX_SSA_MISC_EXINFO_SIZE 16
-
-/**
- * enum sgx_attributes - the attributes field in &struct sgx_secs
- * %SGX_ATTR_INIT: Enclave can be entered (is initialized).
- * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
- * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave.
- * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
- * attestation.
- * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
- * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
- * sign cryptographic tokens that can be passed to
- * EINIT as an authorization to run an enclave.
- */
-enum sgx_attribute {
- SGX_ATTR_INIT = BIT(0),
- SGX_ATTR_DEBUG = BIT(1),
- SGX_ATTR_MODE64BIT = BIT(2),
- SGX_ATTR_PROVISIONKEY = BIT(4),
- SGX_ATTR_EINITTOKENKEY = BIT(5),
- SGX_ATTR_KSS = BIT(7),
-};
-
-#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
-
-/**
- * struct sgx_secs - SGX Enclave Control Structure (SECS)
- * @size: size of the address space
- * @base: base address of the address space
- * @ssa_frame_size: size of an SSA frame
- * @miscselect: additional information stored to an SSA frame
- * @attributes: attributes for enclave
- * @xfrm: XSave-Feature Request Mask (subset of XCR0)
- * @mrenclave: SHA256-hash of the enclave contents
- * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT
- * @config_id: a user-defined value that is used in key derivation
- * @isv_prod_id: a user-defined value that is used in key derivation
- * @isv_svn: a user-defined value that is used in key derivation
- * @config_svn: a user-defined value that is used in key derivation
- *
- * SGX Enclave Control Structure (SECS) is a special enclave page that is not
- * visible in the address space. In fact, this structure defines the address
- * range and other global attributes for the enclave and it is the first EPC
- * page created for any enclave. It is moved from a temporary buffer to an EPC
- * by the means of ENCLS[ECREATE] function.
- */
-struct sgx_secs {
- u64 size;
- u64 base;
- u32 ssa_frame_size;
- u32 miscselect;
- u8 reserved1[24];
- u64 attributes;
- u64 xfrm;
- u32 mrenclave[8];
- u8 reserved2[32];
- u32 mrsigner[8];
- u8 reserved3[32];
- u32 config_id[16];
- u16 isv_prod_id;
- u16 isv_svn;
- u16 config_svn;
- u8 reserved4[3834];
-} __packed;
-
-/**
- * enum sgx_tcs_flags - execution flags for TCS
- * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
- * inside an enclave. It is cleared by EADD but can
- * be set later with EDBGWR.
- */
-enum sgx_tcs_flags {
- SGX_TCS_DBGOPTIN = 0x01,
-};
-
-#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1)
-#define SGX_TCS_RESERVED_SIZE 4024
-
-/**
- * struct sgx_tcs - Thread Control Structure (TCS)
- * @state: used to mark an entered TCS
- * @flags: execution flags (cleared by EADD)
- * @ssa_offset: SSA stack offset relative to the enclave base
- * @ssa_index: the current SSA frame index (cleard by EADD)
- * @nr_ssa_frames: the number of frame in the SSA stack
- * @entry_offset: entry point offset relative to the enclave base
- * @exit_addr: address outside the enclave to exit on an exception or
- * interrupt
- * @fs_offset: offset relative to the enclave base to become FS
- * segment inside the enclave
- * @gs_offset: offset relative to the enclave base to become GS
- * segment inside the enclave
- * @fs_limit: size to become a new FS-limit (only 32-bit enclaves)
- * @gs_limit: size to become a new GS-limit (only 32-bit enclaves)
- *
- * Thread Control Structure (TCS) is an enclave page visible in its address
- * space that defines an entry point inside the enclave. A thread enters inside
- * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered
- * by only one thread at a time.
- */
-struct sgx_tcs {
- u64 state;
- u64 flags;
- u64 ssa_offset;
- u32 ssa_index;
- u32 nr_ssa_frames;
- u64 entry_offset;
- u64 exit_addr;
- u64 fs_offset;
- u64 gs_offset;
- u32 fs_limit;
- u32 gs_limit;
- u8 reserved[SGX_TCS_RESERVED_SIZE];
-} __packed;
-
-/**
- * struct sgx_pageinfo - an enclave page descriptor
- * @addr: address of the enclave page
- * @contents: pointer to the page contents
- * @metadata: pointer either to a SECINFO or PCMD instance
- * @secs: address of the SECS page
- */
-struct sgx_pageinfo {
- u64 addr;
- u64 contents;
- u64 metadata;
- u64 secs;
-} __packed __aligned(32);
-
-
-/**
- * enum sgx_page_type - bits in the SECINFO flags defining the page type
- * %SGX_PAGE_TYPE_SECS: a SECS page
- * %SGX_PAGE_TYPE_TCS: a TCS page
- * %SGX_PAGE_TYPE_REG: a regular page
- * %SGX_PAGE_TYPE_VA: a VA page
- * %SGX_PAGE_TYPE_TRIM: a page in trimmed state
- */
-enum sgx_page_type {
- SGX_PAGE_TYPE_SECS,
- SGX_PAGE_TYPE_TCS,
- SGX_PAGE_TYPE_REG,
- SGX_PAGE_TYPE_VA,
- SGX_PAGE_TYPE_TRIM,
-};
-
-#define SGX_NR_PAGE_TYPES 5
-#define SGX_PAGE_TYPE_MASK GENMASK(7, 0)
-
-/**
- * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
- * %SGX_SECINFO_R: allow read
- * %SGX_SECINFO_W: allow write
- * %SGX_SECINFO_X: allow execution
- * %SGX_SECINFO_SECS: a SECS page
- * %SGX_SECINFO_TCS: a TCS page
- * %SGX_SECINFO_REG: a regular page
- * %SGX_SECINFO_VA: a VA page
- * %SGX_SECINFO_TRIM: a page in trimmed state
- */
-enum sgx_secinfo_flags {
- SGX_SECINFO_R = BIT(0),
- SGX_SECINFO_W = BIT(1),
- SGX_SECINFO_X = BIT(2),
- SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8),
- SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8),
- SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8),
- SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8),
- SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8),
-};
-
-#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0)
-#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8)
-#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \
- SGX_SECINFO_PAGE_TYPE_MASK)
-
-/**
- * struct sgx_secinfo - describes attributes of an EPC page
- * @flags: permissions and type
- *
- * Used together with ENCLS leaves that add or modify an EPC page to an
- * enclave to define page permissions and type.
- */
-struct sgx_secinfo {
- u64 flags;
- u8 reserved[56];
-} __packed __aligned(64);
-
-#define SGX_PCMD_RESERVED_SIZE 40
-
-/**
- * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
- * @enclave_id: enclave identifier
- * @mac: MAC over PCMD, page contents and isvsvn
- *
- * PCMD is stored for every swapped page to the regular memory. When ELDU loads
- * the page back it recalculates the MAC by using a isvsvn number stored in a
- * VA page. Together these two structures bring integrity and rollback
- * protection.
- */
-struct sgx_pcmd {
- struct sgx_secinfo secinfo;
- u64 enclave_id;
- u8 reserved[SGX_PCMD_RESERVED_SIZE];
- u8 mac[16];
-} __packed __aligned(128);
-
-#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
-#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
-#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
-#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
-
-/**
- * struct sgx_sigstruct_header - defines author of the enclave
- * @header1: constant byte string
- * @vendor: must be either 0x0000 or 0x8086
- * @date: YYYYMMDD in BCD
- * @header2: costant byte string
- * @swdefined: software defined value
- */
-struct sgx_sigstruct_header {
- u64 header1[2];
- u32 vendor;
- u32 date;
- u64 header2[2];
- u32 swdefined;
- u8 reserved1[84];
-} __packed;
-
-/**
- * struct sgx_sigstruct_body - defines contents of the enclave
- * @miscselect: additional information stored to an SSA frame
- * @misc_mask: required miscselect in SECS
- * @attributes: attributes for enclave
- * @xfrm: XSave-Feature Request Mask (subset of XCR0)
- * @attributes_mask: required attributes in SECS
- * @xfrm_mask: required XFRM in SECS
- * @mrenclave: SHA256-hash of the enclave contents
- * @isvprodid: a user-defined value that is used in key derivation
- * @isvsvn: a user-defined value that is used in key derivation
- */
-struct sgx_sigstruct_body {
- u32 miscselect;
- u32 misc_mask;
- u8 reserved2[20];
- u64 attributes;
- u64 xfrm;
- u64 attributes_mask;
- u64 xfrm_mask;
- u8 mrenclave[32];
- u8 reserved3[32];
- u16 isvprodid;
- u16 isvsvn;
-} __packed;
-
-/**
- * struct sgx_sigstruct - an enclave signature
- * @header: defines author of the enclave
- * @modulus: the modulus of the public key
- * @exponent: the exponent of the public key
- * @signature: the signature calculated over the fields except modulus,
- * @body: defines contents of the enclave
- * @q1: a value used in RSA signature verification
- * @q2: a value used in RSA signature verification
- *
- * Header and body are the parts that are actual signed. The remaining fields
- * define the signature of the enclave.
- */
-struct sgx_sigstruct {
- struct sgx_sigstruct_header header;
- u8 modulus[SGX_MODULUS_SIZE];
- u32 exponent;
- u8 signature[SGX_MODULUS_SIZE];
- struct sgx_sigstruct_body body;
- u8 reserved4[12];
- u8 q1[SGX_MODULUS_SIZE];
- u8 q2[SGX_MODULUS_SIZE];
-} __packed;
-
-#define SGX_LAUNCH_TOKEN_SIZE 304
-
-#endif /* _ASM_X86_SGX_ARCH_H */
.get_unmapped_area = sgx_get_unmapped_area,
};
-const struct file_operations sgx_provision_fops = {
- .owner = THIS_MODULE,
-};
-
static struct miscdevice sgx_dev_enclave = {
.minor = MISC_DYNAMIC_MINOR,
.name = "sgx_enclave",
.fops = &sgx_encl_fops,
};
-static struct miscdevice sgx_dev_provision = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "sgx_provision",
- .nodename = "sgx_provision",
- .fops = &sgx_provision_fops,
-};
-
int __init sgx_drv_init(void)
{
unsigned int eax, ebx, ecx, edx;
if (ret)
return ret;
- ret = misc_register(&sgx_dev_provision);
- if (ret) {
- misc_deregister(&sgx_dev_enclave);
- return ret;
- }
-
return 0;
}
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
if (ret) {
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(ret);
}
if (sgx_unmark_page_reclaimable(entry->epc_page))
continue;
- sgx_free_epc_page(entry->epc_page);
+ sgx_encl_free_epc_page(entry->epc_page);
encl->secs_child_cnt--;
entry->epc_page = NULL;
}
xa_destroy(&encl->page_array);
if (!encl->secs_child_cnt && encl->secs.epc_page) {
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
}
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
list);
list_del(&va_page->list);
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
kfree(va_page);
}
ret = __epa(sgx_get_epc_virt_addr(epc_page));
if (ret) {
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(-EFAULT);
}
return slot == SGX_VA_SLOT_COUNT;
}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
#endif /* _X86_ENCL_H */
#include <asm/traps.h>
#include "sgx.h"
-enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
-};
-
/**
* ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
*
} while (0); \
}
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & ENCLS_FAULT_FLAG;
+}
+
/**
* encls_failed() - Check if an ENCLS function failed
* @ret: the return value of an ENCLS function call
*/
static inline bool encls_failed(int ret)
{
- if (ret & ENCLS_FAULT_FLAG)
+ if (encls_faulted(ret))
return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
return !!ret;
/* Copyright(c) 2016-20 Intel Corporation. */
#include <asm/mman.h>
+#include <asm/sgx.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
encl->page_cnt--;
if (va_page) {
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
list_del(&va_page->list);
kfree(va_page);
}
return 0;
err_out:
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
err_out_backing:
mmap_read_unlock(current->mm);
err_out_free:
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
kfree(encl_page);
return ret;
void *token)
{
u64 mrsigner[4];
- int i, j, k;
+ int i, j;
void *addr;
int ret;
preempt_disable();
- for (k = 0; k < 4; k++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+ sgx_update_lepubkeyhash(mrsigner);
ret = __einit(sigstruct, token, addr);
}
}
- if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_faulted(ret)) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EINIT");
{
struct sgx_sigstruct *sigstruct;
struct sgx_enclave_init init_arg;
- struct page *initp_page;
void *token;
int ret;
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
- initp_page = alloc_page(GFP_KERNEL);
- if (!initp_page)
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
return -ENOMEM;
- sigstruct = kmap(initp_page);
token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
ret = sgx_encl_init(encl, sigstruct, token);
out:
- kunmap(initp_page);
- __free_page(initp_page);
+ kfree(sigstruct);
return ret;
}
static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
{
struct sgx_enclave_provision params;
- struct file *file;
if (copy_from_user(¶ms, arg, sizeof(params)))
return -EFAULT;
- file = fget(params.fd);
- if (!file)
- return -EINVAL;
-
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
- return -EINVAL;
- }
-
- encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
- return 0;
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */
+#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
* with sgx_reclaimer_lock acquired.
*/
static LIST_HEAD(sgx_active_page_list);
-
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+/* The free page list lock protected variables prepend the lock. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
/*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
*/
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
- /* init_laundry_list is thread-local, no need for a lock: */
- while (!list_empty(§ion->init_laundry_list)) {
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
return;
- /* needed for access to ->page_list: */
- spin_lock(§ion->lock);
-
- page = list_first_entry(§ion->init_laundry_list,
- struct sgx_epc_page, list);
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
ret = __eremove(sgx_get_epc_virt_addr(page));
- if (!ret)
- list_move(&page->list, §ion->page_list);
- else
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
-
- spin_unlock(§ion->lock);
+ }
cond_resched();
}
- list_splice(&dirty, §ion->init_laundry_list);
+ list_splice(&dirty, dirty_page_list);
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
sgx_encl_put_backing(&secs_backing, true);
struct sgx_epc_section *section;
struct sgx_encl_page *encl_page;
struct sgx_epc_page *epc_page;
+ struct sgx_numa_node *node;
pgoff_t page_index;
int cnt = 0;
int ret;
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
section = &sgx_epc_sections[epc_page->section];
- spin_lock(§ion->lock);
- list_add_tail(&epc_page->list, §ion->page_list);
- section->free_cnt++;
- spin_unlock(§ion->lock);
- }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
- unsigned long cnt = 0;
- int i;
-
- for (i = 0; i < sgx_nr_epc_sections; i++)
- cnt += sgx_epc_sections[i].free_cnt;
+ node = section->node;
- return cnt;
+ spin_lock(&node->lock);
+ list_add_tail(&epc_page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
+ spin_unlock(&node->lock);
+ }
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages() < watermark &&
- !list_empty(&sgx_active_page_list);
+ return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
{
- int i;
-
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- for (i = 0; i < sgx_nr_epc_sections; i++)
- sgx_sanitize_section(&sgx_epc_sections[i]);
-
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- sgx_sanitize_section(&sgx_epc_sections[i]);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
- /* Should never happen. */
- if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
- WARN(1, "EPC section %d has unsanitized pages.\n", i);
- }
+ /* sanity check: */
+ WARN_ON(!list_empty(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
return true;
}
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
- struct sgx_epc_page *page;
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
- spin_lock(§ion->lock);
+ spin_lock(&node->lock);
- if (list_empty(§ion->page_list)) {
- spin_unlock(§ion->lock);
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
return NULL;
}
- page = list_first_entry(§ion->page_list, struct sgx_epc_page, list);
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- section->free_cnt--;
+ sgx_nr_free_pages--;
+
+ spin_unlock(&node->lock);
- spin_unlock(§ion->lock);
return page;
}
/**
* __sgx_alloc_epc_page() - Allocate an EPC page
*
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
+ * from the NUMA node, where the caller is executing.
*
* Return:
- * an EPC page,
- * -errno on error
+ * - an EPC page: A borrowed EPC pages were available.
+ * - NULL: Out of EPC pages.
*/
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
- struct sgx_epc_section *section;
struct sgx_epc_page *page;
- int i;
+ int nid_of_current = numa_node_id();
+ int nid = nid_of_current;
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- section = &sgx_epc_sections[i];
+ if (node_isset(nid_of_current, sgx_numa_mask)) {
+ page = __sgx_alloc_epc_page_from_node(nid_of_current);
+ if (page)
+ return page;
+ }
+
+ /* Fall back to the non-local NUMA nodes: */
+ while (true) {
+ nid = next_node_in(nid, sgx_numa_mask);
+ if (nid == nid_of_current)
+ break;
- page = __sgx_alloc_epc_page_from_section(section);
+ page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
}
* sgx_free_epc_page() - Free an EPC page
* @page: an EPC page
*
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
*/
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
- int ret;
+ struct sgx_numa_node *node = section->node;
- WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+ spin_lock(&node->lock);
- ret = __eremove(sgx_get_epc_virt_addr(page));
- if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
- return;
+ list_add_tail(&page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
- spin_lock(§ion->lock);
- list_add_tail(&page->list, §ion->page_list);
- section->free_cnt++;
- spin_unlock(§ion->lock);
+ spin_unlock(&node->lock);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
- spin_lock_init(§ion->lock);
- INIT_LIST_HEAD(§ion->page_list);
- INIT_LIST_HEAD(§ion->init_laundry_list);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
- list_add_tail(§ion->pages[i].list, §ion->init_laundry_list);
+ list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list);
}
- section->free_cnt = nr_pages;
return true;
}
{
u32 eax, ebx, ecx, edx, type;
u64 pa, size;
+ int nid;
int i;
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
break;
}
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ node_set(nid, sgx_numa_mask);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+
sgx_nr_epc_sections++;
}
return true;
}
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ struct file *file;
+
+ file = fget(attribute_fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
static int __init sgx_init(void)
{
int ret;
goto err_page_cache;
}
- ret = sgx_drv_init();
+ ret = misc_register(&sgx_dev_provision);
if (ret)
goto err_kthread;
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
return 0;
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
err_kthread:
kthread_stop(ksgxd_tsk);
#include <linux/rwsem.h>
#include <linux/types.h>
#include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#undef pr_fmt
#define pr_fmt(fmt) "sgx: " fmt
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/x86/sgx.rst for more information."
+
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
#define SGX_NR_TO_SCAN 16
struct list_head list;
};
+/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ spinlock_t lock;
+};
+
/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
* used to store EPC pages for one EPC section and virtual memory area where
* the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
*/
struct sgx_epc_section {
unsigned long phys_addr;
void *virt_addr;
struct sgx_epc_page *pages;
-
- spinlock_t lock;
- struct list_head page_list;
- unsigned long free_cnt;
-
- /*
- * Pages which need EREMOVE run on them before they can be
- * used. Only safe to be accessed in ksgxd and init code.
- * Not protected by locks.
- */
- struct list_head init_laundry_list;
+ struct sgx_numa_node *node;
};
extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
#endif /* _X86_SGX_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret;
+
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which is because of
+ * EREMOVE'ing an SECS still with child, in which case it can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * virtual EPC have been EREMOVE'd. See comments in below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee there's no
+ * logical processor is still running in the enclave in guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+
+ return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ kfree(vepc);
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * userspace mapping @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
}
}
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
static bool pv_tlb_flush_supported(void)
{
return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
}
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
-
-#ifdef CONFIG_SMP
-
static bool pv_ipi_supported(void)
{
return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
}
}
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs. And
+ * queue flush_on_enter for pre-empted vCPUs
+ */
+ for_each_cpu(cpu, flushmask) {
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_others(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+ int cpu;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported() || pv_ipi_supported())
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+
+ return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
local_irq_enable();
return 0;
}
-#endif
-
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{
- u8 state;
- int cpu;
- struct kvm_steal_time *src;
- struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
-
- cpumask_copy(flushmask, cpumask);
- /*
- * We have to call flush only on online vCPUs. And
- * queue flush_on_enter for pre-empted vCPUs
- */
- for_each_cpu(cpu, flushmask) {
- src = &per_cpu(steal_time, cpu);
- state = READ_ONCE(src->preempted);
- if ((state & KVM_VCPU_PREEMPTED)) {
- if (try_cmpxchg(&src->preempted, &state,
- state | KVM_VCPU_FLUSH_TLB))
- __cpumask_clear_cpu(cpu, flushmask);
- }
- }
- native_flush_tlb_others(flushmask, info);
-}
+#endif
static void __init kvm_guest_init(void)
{
pv_ops.time.steal_clock = kvm_steal_clock;
}
- if (pv_tlb_flush_supported()) {
- pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
- pr_info("KVM setup pv remote TLB flush\n");
- }
-
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
}
#ifdef CONFIG_SMP
+ if (pv_tlb_flush_supported()) {
+ pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+ pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
static void __init kvm_apic_init(void)
{
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
if (pv_ipi_supported())
kvm_setup_pv_ipi();
#endif
}
arch_initcall(activate_jump_labels);
-static __init int kvm_alloc_cpumask(void)
-{
- int cpu;
- bool alloc = false;
-
- if (!kvm_para_available() || nopv)
- return 0;
-
- if (pv_tlb_flush_supported())
- alloc = true;
-
-#if defined(CONFIG_SMP)
- if (pv_ipi_supported())
- alloc = true;
-#endif
-
- if (alloc)
- for_each_possible_cpu(cpu) {
- zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
- GFP_KERNEL, cpu_to_node(cpu));
- }
-
- return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
obj-$(CONFIG_KVM) += kvm.o
#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
* aligned to sizeof(unsigned long) because it's not accessed via bitops.
*/
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
}
#define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ /*
+ * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+ * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+ * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
+ * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+ * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
+ * '1' even on CPUs that don't support XSAVE.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+ if (best) {
+ best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+ best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+ best->ecx |= XFEATURE_MASK_FPSSE;
+ }
+
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
return r;
}
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
{
const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
struct kvm_cpuid_entry2 entry;
reverse_cpuid_check(leaf);
- kvm_cpu_caps[leaf] &= mask;
cpuid_count(cpuid.function, cpuid.index,
&entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
+static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ BUILD_BUG_ON(leaf < NCAPINTS);
+
+ kvm_cpu_caps[leaf] = mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ BUILD_BUG_ON(leaf >= NCAPINTS);
+
+ kvm_cpu_caps[leaf] &= mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
void kvm_set_cpu_caps(void)
{
unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
unsigned int f_gbpages = 0;
unsigned int f_lm = 0;
#endif
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
- BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
sizeof(boot_cpu_data.x86_capability));
memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
- sizeof(kvm_cpu_caps));
+ sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
kvm_cpu_cap_mask(CPUID_1_ECX,
/*
);
kvm_cpu_cap_mask(CPUID_7_0_EBX,
- F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+ F(SGX_LC)
);
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
);
+ kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
entry->edx = 0;
}
break;
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+ SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+ SGX_ATTR_KSS;
+ entry->ebx &= 0;
+ break;
/* Intel PT */
case 0x14:
if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
#include <asm/processor.h>
#include <uapi/asm/kvm_para.h>
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM. Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+ CPUID_12_EAX = NCAPINTS,
+ NR_KVM_CPU_CAPS,
+
+ NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
[CPUID_7_EDX] = { 7, 0, CPUID_EDX},
[CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
};
/*
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
}
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+ if (x86_feature == X86_FEATURE_SGX1)
+ return KVM_X86_FEATURE_SGX1;
+ else if (x86_feature == X86_FEATURE_SGX2)
+ return KVM_X86_FEATURE_SGX2;
+
+ return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ return __feature_translate(x86_feature) / 32;
+}
+
/*
* Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
* the hardware defined bit number (stored in bits 4:0) and a software defined
*/
static __always_inline u32 __feature_bit(int x86_feature)
{
+ x86_feature = __feature_translate(x86_feature);
+
reverse_cpuid_check(x86_feature / 32);
return 1 << (x86_feature & 31);
}
static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
return reverse_cpuid[x86_leaf];
is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
}
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
return;
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
return;
/*
return ((2ULL << (e - s)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
- vcpu->arch.mmu->shadow_root_level);
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->shadow_root_level);
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
* write-protects guest page to sync the guest modification, b) another one is
* used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
* between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
* 2) the first case requires flushing tlb immediately avoiding corrupting
* shadow page table between all vcpus so it should be in the protection of
* mmu-lock. And the another case does not need to flush tlb until returning
* So, there is the problem: the first case can meet the corrupted tlb caused
* by another case which write-protects pages but without flush tlb
* immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set, it works since another case never touches MMU-writable bit.
*
* Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
* readonly, if that happens, we need to flush tlb. Fortunately,
* mmu_spte_update() has already handled it perfectly.
*
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
* - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ * writable on the mmu mapping, check MMU-writable, this is the most
* case, otherwise
* - if we fix page fault on the spte or do write-protection by dirty logging,
* check PT_WRITABLE_MASK.
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
- u64 mask = make_mmio_spte(vcpu, gfn, access);
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
- trace_mark_mmio_spte(sptep, gfn, mask);
- mmu_spte_set(sptep, mask);
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
}
static gfn_t get_mmio_spte_gfn(u64 spte)
return spte & shadow_mmio_access_mask;
}
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access)
-{
- if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(vcpu, sptep, gfn, access);
- return true;
- }
-
- return false;
-}
-
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
u64 kvm_gen, spte_gen, gen;
* handling slots that are not large page aligned.
*/
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+ const struct kvm_memory_slot *slot, int level)
{
unsigned long idx;
rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
- spte &= ~SPTE_MMU_WRITEABLE;
+ spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
return mmu_spte_update(sptep, spte);
return flush;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
return kvm_zap_rmapp(kvm, rmap_head, slot);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t pte)
{
u64 *sptep;
struct rmap_iterator iter;
int need_flush = 0;
u64 new_spte;
- pte_t *ptep = (pte_t *)data;
kvm_pfn_t new_pfn;
- WARN_ON(pte_huge(*ptep));
- new_pfn = pte_pfn(*ptep);
+ WARN_ON(pte_huge(pte));
+ new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
need_flush = 1;
- if (pte_write(*ptep)) {
+ if (pte_write(pte)) {
pte_list_remove(rmap_head, sptep);
goto restart;
} else {
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ rmap_handler_t handler)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct slot_rmap_walk_iterator iterator;
- int ret = 0;
- int i;
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
+ bool ret = false;
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- gfn_start, gfn_end - 1,
- &iterator)
- ret |= handler(kvm, iterator.rmap, memslot,
- iterator.gfn, iterator.level, data);
- }
- }
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator)
+ ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+ iterator.level, range->pte);
return ret;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn, int level,
- unsigned long data))
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
-{
- int r;
+ bool flush;
- r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+ flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
- return r;
+ return flush;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int r;
+ bool flush;
- r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+ flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
- return r;
+ return flush;
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
for_each_rmap_spte(rmap_head, &iter, sptep)
young |= mmu_spte_age(sptep);
- trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young;
+
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
- young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+ young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young;
+
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
- young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+ young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
}
kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too agressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
return 0;
struct kvm_mmu_page *sp;
int ret;
- if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
- return 0;
-
sp = sptep_to_sp(sptep);
ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
- if (unlikely(is_mmio_spte(*sptep)))
- ret = RET_PF_EMULATE;
-
/*
* The fault is fully spurious if and only if the new SPTE and old SPTE
* are identical, and emulation is not required.
}
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
return true;
}
- if (unlikely(is_noslot_pfn(pfn)))
+ if (unlikely(is_noslot_pfn(pfn))) {
vcpu_cache_mmio_info(vcpu, gva, gfn,
access & shadow_mmio_access_mask);
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!shadow_mmio_value)) {
+ *ret_val = RET_PF_EMULATE;
+ return true;
+ }
+ }
return false;
}
if (!is_shadow_present_pte(spte))
break;
+ if (!is_shadow_present_pte(spte))
+ break;
+
sp = sptep_to_sp(iterator.sptep);
if (!is_last_spte(spte, sp->role.level))
break;
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
- if (kvm_mmu_put_root(kvm, sp)) {
- if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_free_root(kvm, sp);
- else if (sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
- }
+ if (is_tdp_mmu_page(sp))
+ kvm_tdp_mmu_put_root(kvm, sp, false);
+ else if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
*root_hpa = INVALID_PAGE;
}
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
- } else {
- for (i = 0; i < 4; ++i)
- if (mmu->pae_root[i] != 0)
- mmu_free_root_page(kvm,
- &mmu->pae_root[i],
- &invalid_list);
- mmu->root_hpa = INVALID_PAGE;
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
}
+ mmu->root_hpa = INVALID_PAGE;
mmu->root_pgd = 0;
}
{
struct kvm_mmu_page *sp;
- write_lock(&vcpu->kvm->mmu_lock);
-
- if (make_mmu_pages_available(vcpu)) {
- write_unlock(&vcpu->kvm->mmu_lock);
- return INVALID_PAGE;
- }
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
- u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->shadow_root_level;
hpa_t root;
unsigned i;
+ int r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->root_hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
- root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
- true);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+ mmu->root_hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
- } else
- BUG();
+ mmu->root_hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
/* root_pgd is ignored for direct MMUs. */
- vcpu->arch.mmu->root_pgd = 0;
-
- return 0;
+ mmu->root_pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
}
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
- u64 pdptr, pm_mask;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
gfn_t root_gfn, root_pgd;
hpa_t root;
- int i;
+ unsigned i;
+ int r;
- root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ root_pgd = mmu->get_guest_pgd(vcpu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
+ /*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ return 1;
+ }
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
/*
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+ if (mmu->root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
- vcpu->arch.mmu->shadow_root_level, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->shadow_root_level, false);
+ mmu->root_hpa = root;
goto set_root_pgd;
}
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
/*
* We shadow a 32 bit page table. This may be a legacy 2-level
* or a PAE 3-level page table. In either case we need to be aware that
* the shadow page table may be a PAE or a long mode page table.
*/
- pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+ if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+ if (WARN_ON_ONCE(!mmu->lm_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
- if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
- pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
- if (!(pdptr & PT_PRESENT_MASK)) {
- vcpu->arch.mmu->pae_root[i] = 0;
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
continue;
}
- root_gfn = pdptr >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
}
root = mmu_alloc_root(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+ mmu->pae_root[i] = root | pm_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+ if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ mmu->root_hpa = __pa(mmu->lm_root);
+ else
+ mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root_pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 *lm_root, *pae_root;
/*
- * If we shadow a 32 bit page table with a long mode page
- * table we enter this path.
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
- if (vcpu->arch.mmu->lm_root == NULL) {
- /*
- * The additional page necessary for this is only
- * allocated on demand.
- */
+ if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+ mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+ return 0;
- u64 *lm_root;
+ /*
+ * This mess only works with 4-level paging and needs to be updated to
+ * work with 5-level paging.
+ */
+ if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+ return -EIO;
- lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (lm_root == NULL)
- return 1;
+ if (mmu->pae_root && mmu->lm_root)
+ return 0;
- lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+ return -EIO;
- vcpu->arch.mmu->lm_root = lm_root;
- }
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+ lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!lm_root) {
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
}
-set_root_pgd:
- vcpu->arch.mmu->root_pgd = root_pgd;
+ mmu->pae_root = pae_root;
+ mmu->lm_root = lm_root;
return 0;
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.mmu->direct_map)
- return mmu_alloc_direct_roots(vcpu);
- else
- return mmu_alloc_shadow_roots(vcpu);
-}
-
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp);
__is_rsvd_bits_set(rsvd_check, sptes[level], level);
if (reserved) {
- pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
__func__, addr);
for (level = root; level >= leaf; level--)
- pr_err("------ spte 0x%llx level %d.\n",
- sptes[level], level);
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
}
return reserved;
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+ return true;
+
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
*pfn = KVM_PFN_NOSLOT;
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
- context->shadow_root_level = new_role.base.level;
-
__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
- if (new_role.as_u64 != context->mmu_role.as_u64)
+ if (new_role.as_u64 != context->mmu_role.as_u64) {
shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+ /*
+ * Override the level set by the common init helper, nested TDP
+ * always uses the host's TDP configuration.
+ */
+ context->shadow_root_level = new_role.base.level;
+ }
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
if (r)
goto out;
- r = mmu_alloc_roots(vcpu);
- kvm_mmu_sync_roots(vcpu);
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->direct_map)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
if (r)
goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
kvm_mmu_load_pgd(vcpu);
static_call(kvm_x86_tlb_flush_current)(vcpu);
out:
return r;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static bool need_remote_flush(u64 old, u64 new)
{
static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+ gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool flush = false;
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
flush |= fn(kvm, iterator.rmap, memslot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && lock_flush_tlb) {
+ if (flush && flush_on_yield) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
}
}
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
- end_gfn - start_gfn + 1);
- flush = false;
- }
-
return flush;
}
static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+ bool flush_on_yield)
{
return slot_handle_level_range(kvm, memslot, fn, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
+ flush_on_yield, false);
}
static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, lock_flush_tlb);
+ PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->lm_root);
}
* while the PDP table is a per-vCPU construct that's allocated at MMU
* creation. When emulating 32-bit mode, cr3 is only 32 bits even on
* x86_64. Therefore we need to allocate the PDP table in the first
- * 4GB of memory, which happens to fit the DMA32 zone. Except for
- * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
- * skip allocating the PDP table.
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
return -ENOMEM;
mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_mask);
+
for (i = 0; i < 4; ++i)
- mmu->pae_root[i] = INVALID_PAGE;
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
return 0;
}
*/
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+ /* In order to ensure all threads see this change when
+ * handling the MMU reload signal, this must happen in the
+ * same critical section as kvm_reload_remote_mmus, and
+ * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
+ * could drop the MMU lock and yield.
+ */
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
+
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
* Then all vcpus will switch to new shadow page table with the new
kvm_zap_obsolete_pages(kvm);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_all(kvm);
-
write_unlock(&kvm->mmu_lock);
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
- bool flush;
+ bool flush = false;
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
if (start >= end)
continue;
- slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
- PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true);
+ flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PG_LEVEL_4K,
+ KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+ write_unlock(&kvm->mmu_lock);
+
if (is_tdp_mmu_enabled(kvm)) {
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+ flush = false;
+
+ read_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+ gfn_end, flush, true);
if (flush)
- kvm_flush_remote_tlbs(kvm);
- }
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end);
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool slot_rmap_write_protect(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
+
/*
* We can flush all the TLBs out of the mmu lock without TLB
* corruption since we just change the spte from writable to
* spte from present to present (changing the spte from present
* to nonpresent will flush all the TLBs immediately), in other
* words, the only case we care is mmu_spte_update() where we
- * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
- * instead of PT_WRITABLE_MASK, that means it does not depend
- * on PT_WRITABLE_MASK anymore.
+ * have checked Host-writable | MMU-writable instead of
+ * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+ * anymore.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+ bool flush;
write_lock(&kvm->mmu_lock);
- slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ flush = false;
+
+ read_lock(&kvm->mmu_lock);
+ flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ read_unlock(&kvm->mmu_lock);
+ }
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
/*
* All current use cases for flushing the TLBs for a specific memslot
- * are related to dirty logging, and do the TLB flush out of mmu_lock.
+ * related to dirty logging, and many do the TLB flush out of mmu_lock.
* The interaction between the various operations on memslot must be
* serialized by slots_locks to ensure the TLB flush from one operation
* is observed by any other operation on the same memslot.
write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+ read_unlock(&kvm->mmu_lock);
+ }
+
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
* function is only used for dirty logging, in which case flushing TLB
kmem_cache_destroy(mmu_page_header_cache);
}
-static void kvm_set_mmio_spte_mask(void)
-{
- u64 mask;
-
- /*
- * Set a reserved PA bit in MMIO SPTEs to generate page faults with
- * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
- * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
- * 52-bit physical addresses then there are no reserved PA bits in the
- * PTEs and so the reserved PA approach must be disabled.
- */
- if (shadow_phys_bits < 52)
- mask = BIT_ULL(51) | PT_PRESENT_MASK;
- else
- mask = 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
static bool get_nx_auto_mode(void)
{
/* Return true when CPU has the bug, and mitigations are ON */
kvm_mmu_reset_all_pte_masks();
- kvm_set_mmio_spte_mask();
-
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
__mmu_spte_walk(vcpu, sp, fn, 2);
#define MMU_WARN_ON(x) do { } while (0)
#endif
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- int root_count; /* Currently serving as active root */
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+ return role.smm ? 1 : 0;
+}
+
static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
- return sp->role.smm ? 1 : 0;
+ return kvm_mmu_role_as_id(sp->role);
}
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- BUG_ON(!sp->root_count);
- lockdep_assert_held(&kvm->mmu_lock);
-
- ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- lockdep_assert_held(&kvm->mmu_lock);
- --sp->root_count;
-
- return !sp->root_count;
-}
-
/*
* Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
*
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
#define SET_SPTE_SPURIOUS BIT(2)
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level);
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
nr_present++;
- host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+ host_writable = sp->spt[i] & shadow_host_writable_mask;
set_spte_ret |= set_spte(vcpu, &sp->spt[i],
pte_access, PG_LEVEL_4K,
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/vmx.h>
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
u64 __read_mostly shadow_me_mask;
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
- u64 mask = generation_mmio_spte_mask(gen);
+ u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
+ WARN_ON_ONCE(!shadow_mmio_value);
+
access &= shadow_mmio_access_mask;
- mask |= shadow_mmio_value | access;
- mask |= gpa | shadow_nonpresent_or_rsvd_mask;
- mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ spte |= shadow_mmio_value | access;
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
- return mask;
+ return spte;
}
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
bool can_unsync, bool host_writable, bool ad_disabled,
u64 *new_spte)
{
- u64 spte = 0;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
int ret = 0;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
- spte |= SPTE_AD_WRPROT_ONLY_MASK;
+ spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+ /*
+ * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set
+ * if PAE paging may be employed (shadow paging or any 32-bit KVM).
+ */
+ WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+ (spte & SPTE_TDP_AD_MASK));
/*
* For the EPT case, shadow_present_mask is 0 if hardware
kvm_is_mmio_pfn(pfn));
if (host_writable)
- spte |= SPTE_HOST_WRITEABLE;
+ spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
- spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
/*
* Optimization: for pte sync, if spte was writable the hash
__func__, gfn);
ret |= SET_SPTE_WRITE_PROTECTED_PT;
pte_access &= ~ACC_WRITE_MASK;
- spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
}
spte = mark_spte_for_access_track(spte);
out:
+ WARN_ON(is_mmio_spte(spte));
*new_spte = spte;
return ret;
}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
- u64 spte;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
- spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
new_spte |= (u64)new_pfn << PAGE_SHIFT;
new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_host_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);
return spte;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
- shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a removed SPTE
+ * must not get a false positive. Removed SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- * - Setting either @accessed_mask or @dirty_mask requires setting both
- * - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
- BUG_ON(!dirty_mask != !accessed_mask);
- BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
- shadow_present_mask = p_mask;
- shadow_acc_track_mask = acc_track_mask;
- shadow_me_mask = me_mask;
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+ shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_me_mask = 0ull;
+
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK, 0);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
void kvm_mmu_reset_all_pte_masks(void)
{
u8 low_phys_bits;
-
- shadow_user_mask = 0;
- shadow_accessed_mask = 0;
- shadow_dirty_mask = 0;
- shadow_nx_mask = 0;
- shadow_x_mask = 0;
- shadow_present_mask = 0;
- shadow_acc_track_mask = 0;
+ u64 mask;
shadow_phys_bits = kvm_get_shadow_phys_bits();
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = sme_me_mask;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
#include "mmu_internal.h"
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * is must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
*/
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+ PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
* the memslots generation and is derived as follows:
*
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
*/
#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 11
+#define MMIO_SPTE_GEN_LOW_END 10
-#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
*/
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
-/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
- PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
/*
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE 0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_removed_spte(u64 spte)
{
static inline bool is_mmio_spte(u64 spte)
{
- return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+ return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline bool is_shadow_present_pte(u64 pte)
-{
- return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
- return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
- (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+ return (spte & shadow_host_writable_mask) &&
+ (spte & shadow_mmu_writable_mask);
}
static inline u64 get_mmio_spte_generation(u64 spte)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
if (!kvm->arch.tdp_mmu_enabled)
rcu_barrier();
}
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- if (kvm_mmu_put_root(kvm, root))
- kvm_tdp_mmu_free_root(kvm, root);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
}
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
- lockdep_assert_held_write(&kvm->mmu_lock);
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
- if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
- return false;
+ tdp_mmu_free_sp(sp);
+}
- kvm_mmu_get_root(kvm, root);
- return true;
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+ WARN_ON(!root->tdp_mmu_page);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ bool shared)
{
struct kvm_mmu_page *next_root;
- next_root = list_next_entry(root, link);
- tdp_mmu_put_root(kvm, root);
+ rcu_read_lock();
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
return next_root;
}
* This makes it safe to release the MMU lock and yield within the loop, but
* if exiting the loop early, the caller must drop the reference to the most
* recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
- typeof(*_root), link); \
- tdp_mmu_next_root_valid(_kvm, _root); \
- _root = tdp_mmu_next_root(_kvm, _root))
-
-#define for_each_tdp_mmu_root(_kvm, _root) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush);
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WARN_ON(root->root_count);
- WARN_ON(!root->tdp_mmu_page);
-
- list_del(&root->link);
-
- zap_gfn_range(kvm, root, 0, max_gfn, false, false);
-
- free_page((unsigned long)root->spt);
- kmem_cache_free(mmu_page_header_cache, root);
-}
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
+ lockdep_is_held_type(&kvm->mmu_lock, 0) || \
+ lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
int level)
return sp;
}
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
union kvm_mmu_page_role role;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+ lockdep_assert_held_write(&kvm->mmu_lock);
- write_lock(&kvm->mmu_lock);
+ role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
/* Check for an existing root before allocating a new one. */
- for_each_tdp_mmu_root(kvm, root) {
- if (root->role.word == role.word) {
- kvm_mmu_get_root(kvm, root);
- write_unlock(&kvm->mmu_lock);
- return root;
- }
+ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word &&
+ kvm_tdp_mmu_get_root(kvm, root))
+ goto out;
}
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
- root->root_count = 1;
-
- list_add(&root->link, &kvm->arch.tdp_mmu_roots);
-
- write_unlock(&kvm->mmu_lock);
-
- return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *root;
+ refcount_set(&root->tdp_mmu_root_count, 1);
- root = get_tdp_mmu_vcpu_root(vcpu);
- if (!root)
- return INVALID_PAGE;
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out:
return __pa(root->spt);
}
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
- free_page((unsigned long)sp->spt);
- kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
- struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
- rcu_head);
-
- tdp_mmu_free_sp(sp);
-}
-
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
- bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
return;
if (is_accessed_spte(old_spte) &&
- (!is_accessed_spte(new_spte) || pfn_changed))
+ (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
if (was_leaf && is_dirty_spte(old_spte) &&
- (!is_dirty_spte(new_spte) || pfn_changed))
+ (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
/*
}
/*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
*
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* Returns: true if the SPTE was set, false if it was not. If false is returned,
* this function will have no side-effects.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
lockdep_assert_held_read(&kvm->mmu_lock);
* Do not change removed SPTEs. Only the thread that froze the SPTE
* may modify it.
*/
- if (iter->old_spte == REMOVED_SPTE)
+ if (is_removed_spte(iter->old_spte))
return false;
if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
new_spte) != iter->old_spte)
return false;
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+ handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
return true;
}
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+ return false;
+
+ handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ return true;
+}
+
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(iter->old_spte == REMOVED_SPTE);
+ WARN_ON(is_removed_spte(iter->old_spte));
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
* Return false if a yield was not needed.
*/
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush)
+ struct tdp_iter *iter, bool flush,
+ bool shared)
{
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
if (flush)
kvm_flush_remote_tlbs(kvm);
- cond_resched_rwlock_write(&kvm->mmu_lock);
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup. Note, in some use cases a flush may be
- * required by prior actions. Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush)
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared)
{
struct tdp_iter iter;
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
+retry:
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
flush = false;
continue;
}
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
- flush = true;
+ if (!shared) {
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ flush = true;
+ } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
}
rcu_read_unlock();
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU in write mode.
*/
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush,
+ bool shared)
{
struct kvm_mmu_page *root;
- bool flush = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+ flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+ shared);
return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- bool flush;
+ bool flush = false;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+ flush, false);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root)
+{
+ struct kvm_mmu_page *next_root;
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root && !(next_root->role.invalid &&
+ refcount_read(&next_root->tdp_mmu_root_count)))
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link,
+ typeof(*next_root), link);
+
+ return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ struct kvm_mmu_page *next_root;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ root = next_invalidated_root(kvm, NULL);
+
+ while (root) {
+ next_root = next_invalidated_root(kvm, root);
+
+ rcu_read_unlock();
+
+ flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+ true);
+
+ /*
+ * Put the reference acquired in
+ * kvm_tdp_mmu_invalidate_roots
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+
+ root = next_root;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
+/*
+ * Mark each TDP MMU root as invalid so that other threads
+ * will drop their references and allow the root count to
+ * go to 0.
+ *
+ * Also take a reference on all roots so that this thread
+ * can do the bulk of the work required to free the roots
+ * once they are invalidated. Without this reference, a
+ * vCPU thread might drop the last reference to a root and
+ * get stuck with tearing down the entire paging structure.
+ *
+ * Roots which have a zero refcount should be skipped as
+ * they're already being torn down.
+ * Already invalid roots should be referenced again so that
+ * they aren't freed before kvm_tdp_mmu_zap_all_fast is
+ * done with them.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+ if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+ root->role.invalid = true;
+}
+
/*
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
- } else
+ } else {
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
+ }
- trace_kvm_mmu_set_spte(iter->level, iter->gfn,
- rcu_dereference(iter->sptep));
if (!prefault)
vcpu->stat.pf_fixed++;
return ret;
}
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root,
- gfn_t start,
- gfn_t end,
- unsigned long data))
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct kvm_mmu_page *root;
- int ret = 0;
- int as_id;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- as_id = kvm_mmu_page_as_id(root);
- slots = __kvm_memslots(kvm, as_id);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- ret |= handler(kvm, memslot, root, gfn_start,
- gfn_end, data);
- }
- }
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+ flush |= zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
- return ret;
+ return flush;
}
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long unused)
-{
- return zap_gfn_range(kvm, root, start, end, false, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ tdp_handler_t handler)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- zap_gfn_range_hva_wrapper);
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
+
+ rcu_read_lock();
+
+ /*
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code.
+ */
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+ ret |= handler(kvm, &iter, range);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
}
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
*/
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start, gfn_t end,
- unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- int young = 0;
u64 new_spte = 0;
- rcu_read_lock();
+ /* If we have a non-accessed entry we don't need to change the pte. */
+ if (!is_accessed_spte(iter->old_spte))
+ return false;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ new_spte = iter->old_spte;
+
+ if (spte_ad_enabled(new_spte)) {
+ new_spte &= ~shadow_accessed_mask;
+ } else {
/*
- * If we have a non-accessed entry we don't need to change the
- * pte.
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
*/
- if (!is_accessed_spte(iter.old_spte))
- continue;
-
- new_spte = iter.old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)&new_spte);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+ if (is_writable_pte(new_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(new_spte));
- new_spte = mark_spte_for_access_track(new_spte);
- }
- new_spte &= ~shadow_dirty_mask;
-
- tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
- young = 1;
-
- trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+ new_spte = mark_spte_for_access_track(new_spte);
}
- rcu_read_unlock();
+ tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
- return young;
+ return true;
}
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- age_gfn_range);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
-
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
- if (is_accessed_spte(iter.old_spte))
- return 1;
-
- return 0;
+ return is_accessed_spte(iter->old_spte);
}
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
- test_age_gfn);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- pte_t *ptep = (pte_t *)data;
- kvm_pfn_t new_pfn;
u64 new_spte;
- int need_flush = 0;
-
- rcu_read_lock();
- WARN_ON(pte_huge(*ptep));
+ /* Huge pages aren't expected to be modified without first being zapped. */
+ WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
- new_pfn = pte_pfn(*ptep);
-
- tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
- if (iter.level != PG_LEVEL_4K)
- continue;
-
- if (!is_shadow_present_pte(iter.old_spte))
- break;
-
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+ if (iter->level != PG_LEVEL_4K ||
+ !is_shadow_present_pte(iter->old_spte))
+ return false;
- if (!pte_write(*ptep)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- iter.old_spte, new_pfn);
+ /*
+ * Note, when changing a read-only SPTE, it's not strictly necessary to
+ * zero the SPTE before setting the new PFN, but doing so preserves the
+ * invariant that the PFN of a present * leaf SPTE can never change.
+ * See __handle_changed_spte().
+ */
+ tdp_mmu_set_spte(kvm, iter, 0);
- tdp_mmu_set_spte(kvm, &iter, new_spte);
- }
+ if (!pte_write(range->pte)) {
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+ pte_pfn(range->pte));
- need_flush = 1;
+ tdp_mmu_set_spte(kvm, iter, new_spte);
}
- if (need_flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
- rcu_read_unlock();
-
- return 0;
+ return true;
}
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * data is a pointer to the new pte_t mapping the HVA specified by the MMU
+ * notifier.
+ * Returns non-zero if a flush is needed before releasing the MMU lock.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
- (unsigned long)host_ptep,
- set_tdp_spte);
+ bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+ /* FIXME: return 'flush' instead of flushing here. */
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+ return false;
}
/*
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+ new_spte)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
spte_set = true;
}
int min_level)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
- }
return spte_set;
}
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (spte_ad_need_write_protect(iter.old_spte)) {
continue;
}
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+ new_spte)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
spte_set = true;
}
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
- }
return spte_set;
}
bool wrprot)
{
struct kvm_mmu_page *root;
- int root_as_id;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
- }
}
/*
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static void zap_collapsible_spte_range(struct kvm *kvm,
+static bool zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot,
+ bool flush)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
kvm_pfn_t pfn;
- bool spte_set = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
- spte_set = false;
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+ flush = false;
continue;
}
pfn, PG_LEVEL_NUM))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- spte_set = true;
+ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
+ flush = true;
}
rcu_read_unlock();
- if (spte_set)
- kvm_flush_remote_tlbs(kvm);
+
+ return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ bool flush)
{
struct kvm_mmu_page *root;
- int root_as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
- zap_collapsible_spte_range(kvm, root, slot);
- }
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ flush = zap_collapsible_spte_range(kvm, root, slot, flush);
+
+ return flush;
}
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
break;
new_spte = iter.old_spte &
- ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn);
- }
+
return spte_set;
}
#include <linux/kvm_host.h>
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
- gfn_t end)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+ struct kvm_mmu_page *root)
{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+ if (root->role.invalid)
+ return false;
+
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush,
+ bool shared);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+ gfn_t start, gfn_t end, bool flush,
+ bool shared)
+{
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+ shared);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
* of the shadow page's gfn range and stop iterating before yielding.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
+ sp->gfn, end, false, false, false);
}
+
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
bool prefault);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int min_level);
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ bool flush);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
return -EINVAL;
- if (!svm->vcpu.arch.apic->regs)
+ if (!vcpu->arch.apic->regs)
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
return ret;
}
- svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+ svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
entry = avic_get_physical_id_entry(vcpu, id);
}
}
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, svm->vcpu.vcpu_id, icrh, icrl);
+ index, vcpu->vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
return ret;
}
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret = 0;
u32 offset = svm->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
AVIC_UNACCEL_ACCESS_WRITE_MASK;
bool trap = is_avic_unaccelerated_access_trap(offset);
- trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
trap, write, vector);
if (trap) {
/* Handling Trap */
ret = avic_unaccel_trap_write(svm);
} else {
/* Handling Fault */
- ret = kvm_emulate_instruction(&svm->vcpu, 0);
+ ret = kvm_emulate_instruction(vcpu, 0);
}
return ret;
if (!avic || !irqchip_in_kernel(vcpu->kvm))
return 0;
- ret = avic_init_backing_page(&svm->vcpu);
+ ret = avic_init_backing_page(vcpu);
if (ret)
return ret;
#include "lapic.h"
#include "svm.h"
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
return;
c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
+ h = &svm->vmcb01.ptr->control;
g = &svm->nested.ctl;
for (i = 0; i < MAX_INTERCEPT; i++)
return true;
}
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
{
- struct vcpu_svm *svm = to_svm(vcpu);
+ u64 addr = PAGE_ALIGN(pa);
- if (WARN_ON(!is_guest_mode(vcpu)))
- return true;
-
- if (!nested_svm_vmrun_msrpm(svm)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return false;
- }
-
- return true;
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_control_area *control)
{
- if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+ if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
return false;
- if (control->asid == 0)
+ if (CC(control->asid == 0))
return false;
- if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
- !npt_enabled)
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
return false;
return true;
}
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area *save)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- bool vmcb12_lma;
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+ return false;
+ }
+
+ if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ return true;
+}
+/* Common checks that apply to both L1 and L2 state. */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area *save)
+{
/*
* FIXME: these should be done after copying the fields,
* to avoid TOC/TOU races. For these save area checks
* kvm_set_cr4 handle failure; EFER_SVME is an exception
* so it is force-set later in nested_prepare_vmcb_save.
*/
- if ((vmcb12->save.efer & EFER_SVME) == 0)
+ if (CC(!(save->efer & EFER_SVME)))
return false;
- if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
return false;
- if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
return false;
- vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+ if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+ return false;
- if (vmcb12_lma) {
- if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
- !(vmcb12->save.cr0 & X86_CR0_PE) ||
- kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
- return false;
- }
- if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
return false;
return true;
}
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
- struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
{
copy_vmcb_control_area(&svm->nested.ctl, control);
/*
* Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
*/
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
{
u32 mask;
svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
* Transfer any event that L0 or L1 wanted to inject into L2 to
* EXIT_INT_INFO.
*/
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
- struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 exit_int_info = 0;
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt)
{
- if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
return -EINVAL;
if (!nested_npt && is_pae_paging(vcpu) &&
(cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
return -EINVAL;
}
return 0;
}
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+
+ nested_vmcb02_compute_g_pat(svm);
+
/* Load the nested guest state */
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.gdtr = vmcb12->save.gdtr;
- svm->vmcb->save.idtr = vmcb12->save.idtr;
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ svm->vmcb->save.es = vmcb12->save.es;
+ svm->vmcb->save.cs = vmcb12->save.cs;
+ svm->vmcb->save.ss = vmcb12->save.ss;
+ svm->vmcb->save.ds = vmcb12->save.ds;
+ svm->vmcb->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+ svm->vmcb->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+ }
+
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
/*
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
- svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+ /* These bits will be set properly on the first execution when new_vmc12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+ }
}
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ /*
+ * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+ * avic_physical_id.
+ */
+ WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+ svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+ svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
if (nested_npt_enabled(svm))
nested_svm_init_mmu_context(&svm->vcpu);
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
- (svm->nested.hsave->control.int_ctl & mask);
+ (svm->vmcb01.ptr->control.int_ctl & mask);
svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
enter_guest_mode(&svm->vcpu);
/*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
*/
recalc_intercepts(svm);
+}
- vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
struct vmcb *vmcb12)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
svm->nested.vmcb12_gpa = vmcb12_gpa;
- nested_prepare_vmcb_control(svm);
- nested_prepare_vmcb_save(svm, vmcb12);
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
return ret;
if (!npt_enabled)
- svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+ vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
svm_set_gif(svm, true);
return 0;
}
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb12_gpa;
- if (is_smm(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ ++vcpu->stat.nested_run;
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
vmcb12_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
} else if (ret) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
vmcb12 = map.hva;
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- load_nested_vmcb_control(svm, &vmcb12->control);
+ nested_load_control_from_vmcb12(svm, &vmcb12->control);
- if (!nested_vmcb_check_save(svm, vmcb12) ||
- !nested_vmcb_check_controls(&svm->nested.ctl)) {
+ if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+ !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
/* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
/*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
*/
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(&hsave->control, &vmcb->control);
+ svm->vmcb01.ptr->save.efer = vcpu->arch.efer;
+ svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu);
+ svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4;
+ svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+ svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
nested_svm_vmexit(svm);
out:
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
int nested_svm_vmexit(struct vcpu_svm *svm)
{
- int rc;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
+ int rc;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ /* Triple faults in L2 should never escape. */
+ WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
vmcb12 = map.hva;
/* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
+ leave_guest_mode(vcpu);
svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
vmcb12->save.gdtr = vmcb->save.gdtr;
vmcb12->save.idtr = vmcb->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer;
- vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
- vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
vmcb12->save.cr2 = vmcb->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
- vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
- vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
- vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
- vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
vmcb12->save.dr7 = vmcb->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
vmcb12->save.cpl = vmcb->save.cpl;
vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
- nested_vmcb_save_pending_event(svm, vmcb12);
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled)
vmcb12->control.next_rip = vmcb->control.next_rip;
vmcb12->control.pause_filter_thresh =
svm->vmcb->control.pause_filter_thresh;
- /* Restore the original control entries */
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+ WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
- /* On vmexit the GIF is set to false */
+ /*
+ * On vmexit the GIF is set to false and
+ * no event can be injected in L1.
+ */
svm_set_gif(svm, false);
+ svm->vmcb->control.exit_int_info = 0;
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
- svm->vcpu.arch.l1_tsc_offset;
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ }
svm->nested.ctl.nested_cr3 = 0;
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- kvm_rax_write(&svm->vcpu, hsave->save.rax);
- kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
- kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = DR7_FIXED_1;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+ svm_set_efer(vcpu, svm->vmcb->save.efer);
+ svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+ kvm_rax_write(vcpu, svm->vmcb->save.rax);
+ kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+ kvm_rip_write(vcpu, svm->vmcb->save.rip);
- vmcb_mark_all_dirty(svm->vmcb);
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
vmcb12->control.exit_info_1,
vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+ rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
if (rc)
return 1;
- if (npt_enabled)
- svm->vmcb->save.cr3 = hsave->save.cr3;
-
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
* doesn't end up in L1.
*/
svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+ * right now so that it an be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
return 0;
}
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
int svm_allocate_nested(struct vcpu_svm *svm)
{
- struct page *hsave_page;
+ struct page *vmcb02_page;
if (svm->nested.initialized)
return 0;
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!hsave_page)
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb02_page)
return -ENOMEM;
- svm->nested.hsave = page_address(hsave_page);
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
- goto err_free_hsave;
+ goto err_free_vmcb02;
svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
-err_free_hsave:
- __free_page(hsave_page);
+err_free_vmcb02:
+ __free_page(vmcb02_page);
return -ENOMEM;
}
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
- __free_page(virt_to_page(svm->nested.hsave));
- svm->nested.hsave = NULL;
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
svm->nested.initialized = false;
}
*/
void svm_leave_nested(struct vcpu_svm *svm)
{
- if (is_guest_mode(&svm->vcpu)) {
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
- leave_guest_mode(&svm->vcpu);
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
}
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
return vmexit;
}
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME) ||
- !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
return 1;
}
nested_svm_vmexit(svm);
}
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_SMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
-static void nested_svm_init(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_INIT;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
return -EBUSY;
if (!nested_exit_on_init(svm))
return 0;
- nested_svm_init(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
return 0;
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ /*
+ * Only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the EXITINTINFO
+ * vmcb field, while delivering the pending exception.
+ */
+ if (svm->nested.nested_run_pending)
return -EBUSY;
if (!nested_exit_on_exception(svm))
return 0;
return -EBUSY;
if (!nested_exit_on_smi(svm))
return 0;
- nested_svm_smi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
return -EBUSY;
if (!nested_exit_on_nmi(svm))
return 0;
- nested_svm_nmi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
return 0;
}
return -EBUSY;
if (!nested_exit_on_intr(svm))
return 0;
- nested_svm_intr(svm);
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
return 0;
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
- excp_bits)
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
return NESTED_EXIT_HOST;
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
svm->vcpu.arch.apf.host_apf_flags)
if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
sizeof(user_vmcb->control)))
return -EFAULT;
- if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
sizeof(user_vmcb->save)))
return -EFAULT;
-
out:
return kvm_state.size;
}
struct kvm_nested_state *kvm_state)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
- save = kzalloc(sizeof(*save), GFP_KERNEL);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
if (!ctl || !save)
goto out_free;
goto out_free;
ret = -EINVAL;
- if (!nested_vmcb_check_controls(ctl))
+ if (!nested_vmcb_check_controls(vcpu, ctl))
goto out_free;
/*
* Processor state contains L2 state. Check that it is
- * valid for guest mode (see nested_vmcb_checks).
+ * valid for guest mode (see nested_vmcb_check_save).
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
- * TODO: validate reserved bits for all saved state.
*/
- if (!(save->cr0 & X86_CR0_PG))
- goto out_free;
- if (!(save->efer & EFER_SVME))
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !nested_vmcb_valid_sregs(vcpu, save))
goto out_free;
/*
- * All checks done, we can enter guest mode. L1 control fields
- * come from the nested save state. Guest state is already
- * in the registers, the save area of the nested state instead
- * contains saved L1 state.
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+ * L2 registers if needed are moved from the current VMCB to VMCB02.
*/
svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
- copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = *save;
-
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, ctl);
- nested_prepare_vmcb_control(svm);
+ if (svm->current_vmcb == &svm->vmcb01)
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm->vmcb01.ptr->save.es = save->es;
+ svm->vmcb01.ptr->save.cs = save->cs;
+ svm->vmcb01.ptr->save.ss = save->ss;
+ svm->vmcb01.ptr->save.ds = save->ds;
+ svm->vmcb01.ptr->save.gdtr = save->gdtr;
+ svm->vmcb01.ptr->save.idtr = save->idtr;
+ svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+ svm->vmcb01.ptr->save.efer = save->efer;
+ svm->vmcb01.ptr->save.cr0 = save->cr0;
+ svm->vmcb01.ptr->save.cr3 = save->cr3;
+ svm->vmcb01.ptr->save.cr4 = save->cr4;
+ svm->vmcb01.ptr->save.rax = save->rax;
+ svm->vmcb01.ptr->save.rsp = save->rsp;
+ svm->vmcb01.ptr->save.rip = save->rip;
+ svm->vmcb01.ptr->save.cpl = 0;
+
+ nested_load_control_from_vmcb12(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ nested_vmcb02_prepare_control(svm);
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
return ret;
}
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm)))
+ return false;
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
struct kvm_x86_nested_ops svm_nested_ops = {
.check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
return ret;
}
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(int min_asid, int max_asid)
{
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
- struct sev_data_decommission *decommission;
- struct sev_data_deactivate *data;
+ struct sev_data_decommission decommission;
+ struct sev_data_deactivate deactivate;
if (!handle)
return;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return;
-
- /* deactivate handle */
- data->handle = handle;
+ deactivate.handle = handle;
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
down_read(&sev_deactivate_lock);
- sev_guest_deactivate(data, NULL);
+ sev_guest_deactivate(&deactivate, NULL);
up_read(&sev_deactivate_lock);
- kfree(data);
-
- decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
- if (!decommission)
- return;
-
/* decommission handle */
- decommission->handle = handle;
- sev_guest_decommission(decommission, NULL);
-
- kfree(decommission);
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
}
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ bool es_active = argp->id == KVM_SEV_ES_INIT;
int asid, ret;
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
ret = -EBUSY;
if (unlikely(sev->active))
return ret;
+ sev->es_active = es_active;
asid = sev_asid_new(sev);
if (asid < 0)
- return ret;
+ goto e_no_asid;
sev->asid = asid;
ret = sev_platform_init(&argp->error);
goto e_free;
sev->active = true;
+ sev->asid = asid;
INIT_LIST_HEAD(&sev->regions_list);
return 0;
e_free:
sev_asid_free(sev);
sev->asid = 0;
+e_no_asid:
+ sev->es_active = false;
return ret;
}
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- if (!sev_es)
- return -ENOTTY;
-
- to_kvm_svm(kvm)->sev_info.es_active = true;
-
- return sev_guest_init(kvm, argp);
-}
-
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
- struct sev_data_activate *data;
+ struct sev_data_activate activate;
int asid = sev_get_asid(kvm);
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
/* activate ASID on the given handle */
- data->handle = handle;
- data->asid = asid;
- ret = sev_guest_activate(data, error);
- kfree(data);
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
return ret;
}
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_start *start;
+ struct sev_data_launch_start start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
int *error = &argp->error;
if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
- if (!start)
- return -ENOMEM;
+ memset(&start, 0, sizeof(start));
dh_blob = NULL;
if (params.dh_uaddr) {
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
- if (IS_ERR(dh_blob)) {
- ret = PTR_ERR(dh_blob);
- goto e_free;
- }
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
- start->dh_cert_address = __sme_set(__pa(dh_blob));
- start->dh_cert_len = params.dh_len;
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
}
session_blob = NULL;
goto e_free_dh;
}
- start->session_address = __sme_set(__pa(session_blob));
- start->session_len = params.session_len;
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
}
- start->handle = params.handle;
- start->policy = params.policy;
+ start.handle = params.handle;
+ start.policy = params.policy;
/* create memory encryption context */
- ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
if (ret)
goto e_free_session;
/* Bind ASID to this guest */
- ret = sev_bind_asid(kvm, start->handle, error);
+ ret = sev_bind_asid(kvm, start.handle, error);
if (ret)
goto e_free_session;
/* return handle to userspace */
- params.handle = start->handle;
+ params.handle = start.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) {
- sev_unbind_asid(kvm, start->handle);
+ sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
}
- sev->handle = start->handle;
+ sev->handle = start.handle;
sev->fd = argp->sev_fd;
e_free_session:
kfree(session_blob);
e_free_dh:
kfree(dh_blob);
-e_free:
- kfree(start);
return ret;
}
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
- struct sev_data_launch_update_data *data;
+ struct sev_data_launch_update_data data;
struct page **inpages;
int ret;
if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
vaddr = params.uaddr;
size = params.len;
vaddr_end = vaddr + size;
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
- if (IS_ERR(inpages)) {
- ret = PTR_ERR(inpages);
- goto e_free;
- }
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
/*
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
*/
sev_clflush_pages(inpages, npages);
+ data.reserved = 0;
+ data.handle = sev->handle;
+
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
int offset, len;
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
- data->handle = sev->handle;
- data->len = len;
- data->address = __sme_page_pa(inpages[i]) + offset;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
if (ret)
goto e_unpin;
}
/* unlock the user pages */
sev_unpin_memory(kvm, inpages, npages);
-e_free:
- kfree(data);
return ret;
}
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_update_vmsa *vmsa;
+ struct sev_data_launch_update_vmsa vmsa;
+ struct kvm_vcpu *vcpu;
int i, ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
- vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
- if (!vmsa)
- return -ENOMEM;
+ vmsa.reserved = 0;
- for (i = 0; i < kvm->created_vcpus; i++) {
- struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm);
if (ret)
- goto e_free;
+ return ret;
/*
* The LAUNCH_UPDATE_VMSA command will perform in-place
*/
clflush_cache_range(svm->vmsa, PAGE_SIZE);
- vmsa->handle = sev->handle;
- vmsa->address = __sme_pa(svm->vmsa);
- vmsa->len = PAGE_SIZE;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+ vmsa.handle = sev->handle;
+ vmsa.address = __sme_pa(svm->vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
&argp->error);
if (ret)
- goto e_free;
+ return ret;
svm->vcpu.arch.guest_state_protected = true;
}
-e_free:
- kfree(vmsa);
- return ret;
+ return 0;
}
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_measure *data;
+ struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
void *blob = NULL;
if (copy_from_user(¶ms, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
+ data.address = __psp_pa(blob);
+ data.len = params.len;
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(measure, ¶ms, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_finish *data;
- int ret;
+ struct sev_data_launch_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
- kfree(data);
- return ret;
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
- struct sev_data_guest_status *data;
+ struct sev_data_guest_status data;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
if (ret)
- goto e_free;
+ return ret;
- params.policy = data->policy;
- params.state = data->state;
- params.handle = data->handle;
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params)))
ret = -EFAULT;
-e_free:
- kfree(data);
+
return ret;
}
int *error, bool enc)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_dbg *data;
- int ret;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ struct sev_data_dbg data;
- data->handle = sev->handle;
- data->dst_addr = dst;
- data->src_addr = src;
- data->len = size;
+ data.reserved = 0;
+ data.handle = sev->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
- ret = sev_issue_cmd(kvm,
- enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
- data, error);
- kfree(data);
- return ret;
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
}
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_secret *data;
+ struct sev_data_launch_secret data;
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
goto e_unpin_memory;
}
- ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- goto e_unpin_memory;
+ memset(&data, 0, sizeof(data));
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- data->guest_address = __sme_page_pa(pages[0]) + offset;
- data->guest_len = params.guest_len;
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
if (IS_ERR(blob)) {
ret = PTR_ERR(blob);
- goto e_free;
+ goto e_unpin_memory;
}
- data->trans_address = __psp_pa(blob);
- data->trans_len = params.trans_len;
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
if (IS_ERR(hdr)) {
ret = PTR_ERR(hdr);
goto e_free_blob;
}
- data->hdr_address = __psp_pa(hdr);
- data->hdr_len = params.hdr_len;
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
kfree(hdr);
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
e_unpin_memory:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < n; i++) {
{
void __user *report = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_attestation_report *data;
+ struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
void __user *p;
void *blob = NULL;
if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
- memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
*/
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(report, ¶ms, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ int ret;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+ if (ret < 0)
+ return ret;
+
+ params->session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ ¶ms);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ int ret;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+ if (ret < 0)
+ return ret;
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, ¶ms);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (!guest_page)
+ return -EFAULT;
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ params.hdr_len);
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get parameter from the userspace */
+ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret)
+ goto e_free_session;
+
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ ¶ms, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ ret = -EFAULT;
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (!guest_page)
+ goto e_free_trans;
+
+ /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
mutex_lock(&kvm->lock);
+ /* enc_context_owner handles all memory enc operations */
+ if (is_mirroring_enc_context(kvm)) {
+ r = -EINVAL;
+ goto out;
+ }
+
switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
- case KVM_SEV_ES_INIT:
- r = sev_es_guest_init(kvm, &sev_cmd);
- break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
case KVM_SEV_GET_ATTESTATION_REPORT:
r = sev_get_attestation_report(kvm, &sev_cmd);
break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
if (!sev_guest(kvm))
return -ENOTTY;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
struct enc_region *region;
int ret;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
if (!sev_guest(kvm)) {
return ret;
}
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ struct kvm_sev_info *mirror_sev;
+ unsigned int asid;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto e_source_put;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ mutex_lock(&source_kvm->lock);
+
+ if (!sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto e_source_unlock;
+ }
+
+ /* Mirrors of mirrors should work, but let's not get silly */
+ if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ ret = -EINVAL;
+ goto e_source_unlock;
+ }
+
+ asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ kvm_get_kvm(source_kvm);
+
+ fput(source_kvm_file);
+ mutex_unlock(&source_kvm->lock);
+ mutex_lock(&kvm->lock);
+
+ if (sev_guest(kvm)) {
+ ret = -EINVAL;
+ goto e_mirror_unlock;
+ }
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->asid = asid;
+ mirror_sev->active = true;
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+e_mirror_unlock:
+ mutex_unlock(&kvm->lock);
+ kvm_put_kvm(source_kvm);
+ return ret;
+e_source_unlock:
+ mutex_unlock(&source_kvm->lock);
+e_source_put:
+ fput(source_kvm_file);
+ return ret;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
if (!sev_guest(kvm))
return;
+ /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ if (is_mirroring_enc_context(kvm)) {
+ kvm_put_kvm(sev->enc_context_owner);
+ return;
+ }
+
mutex_lock(&kvm->lock);
/*
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
/* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
len, GHCB_SCRATCH_AREA_LIMIT);
return false;
}
- scratch_va = kzalloc(len, GFP_KERNEL);
+ scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
return false;
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
ret = -EINVAL;
break;
return ret;
}
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
struct ghcb *ghcb;
return sev_handle_vmgexit_msr_protocol(svm);
if (!ghcb_gpa) {
- vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
return -EINVAL;
}
- if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
/* Unable to map GHCB from guest */
- vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
return -EINVAL;
}
svm->ghcb = svm->ghcb_map.hva;
ghcb = svm->ghcb_map.hva;
- trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
exit_code = ghcb_get_sw_exit_code(ghcb);
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
break;
- ret = kvm_sev_es_mmio_read(&svm->vcpu,
+ ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->ghcb_sa);
if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
break;
- ret = kvm_sev_es_mmio_write(&svm->vcpu,
+ ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
- ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+ ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
- struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
switch (control->exit_info_1) {
case 0:
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
- vcpu_unimpl(&svm->vcpu,
+ vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
break;
default:
- ret = svm_invoke_exit_handler(svm, exit_code);
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
}
return ret;
* the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
* non-zero value.
*/
+ if (!svm->ghcb)
+ return;
+
ghcb_set_sw_exit_info_2(svm->ghcb, 1);
}
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .always = false },
+ { .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
{ .index = MSR_FS_BASE, .always = true },
* In this case we will return to the nested guest
* as soon as we leave SMM.
*/
- if (!is_smm(&svm->vcpu))
+ if (!is_smm(vcpu))
svm_free_nested(svm);
} else {
bool has_error_code = vcpu->arch.exception.has_error_code;
u32 error_code = vcpu->arch.exception.error_code;
- kvm_deliver_exception_payload(&svm->vcpu);
+ kvm_deliver_exception_payload(vcpu);
if (nr == BP_VECTOR && !nrips) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
/*
* For guest debugging where we have to reinject #BP if some
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
+ (void)skip_emulated_instruction(vcpu);
+ rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
}
u32 *svm_vcpu_alloc_msrpm(void)
{
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+ unsigned int order = get_order(MSRPM_SIZE);
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
u32 *msrpm;
if (!pages)
return NULL;
msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
return msrpm;
}
void svm_vcpu_free_msrpm(u32 *msrpm)
{
- __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
- kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static void svm_hardware_teardown(void)
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+ get_order(IOPM_SIZE));
iopm_base = 0;
}
struct page *iopm_pages;
void *iopm_va;
int r;
+ unsigned int order = get_order(IOPM_SIZE);
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
if (is_guest_mode(vcpu)) {
/* Write L1's TSC offset. */
g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
+ svm->vmcb01.ptr->control.tsc_offset;
+ svm->vmcb01.ptr->control.tsc_offset = offset;
}
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
}
}
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- svm->vcpu.arch.hflags = 0;
+ vcpu->arch.hflags = 0;
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ if (!kvm_vcpu_apicv_active(vcpu))
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
svm_set_intercept(svm, INTERCEPT_RDPRU);
svm_set_intercept(svm, INTERCEPT_RSM);
- if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
svm_set_intercept(svm, INTERCEPT_MONITOR);
svm_set_intercept(svm, INTERCEPT_MWAIT);
}
- if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ if (!kvm_hlt_in_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_HLT);
control->iopm_base_pa = __sme_set(iopm_base);
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_cr4(&svm->vcpu, 0);
- svm_set_efer(&svm->vcpu, 0);
+ svm_set_cr4(vcpu, 0);
+ svm_set_efer(vcpu, 0);
save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+ vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
/*
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
* It also updates the guest-visible cr0 value.
*/
- svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
- kvm_mmu_reset_context(&svm->vcpu);
+ svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ kvm_mmu_reset_context(vcpu);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
clr_exception_intercept(svm, PF_VECTOR);
svm_clr_intercept(svm, INTERCEPT_CR3_READ);
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = svm->vcpu.arch.pat;
+ save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
- svm->asid_generation = 0;
+ svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
svm->nested.vmcb12_gpa = 0;
- svm->vcpu.arch.hflags = 0;
+ svm->nested.last_vmcb12_gpa = 0;
+ vcpu->arch.hflags = 0;
- if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
control->pause_filter_count = pause_filter_count;
if (pause_filter_thresh)
control->pause_filter_thresh = pause_filter_thresh;
svm_check_invpcid(svm);
- if (kvm_vcpu_apicv_active(&svm->vcpu))
- avic_init_vmcb(svm);
-
/*
- * If hardware supports Virtual VMLOAD VMSAVE then enable it
- * in VMCB and clear intercepts to avoid #VMEXIT.
+ * If the host supports V_SPEC_CTRL then disable the interception
+ * of MSR_IA32_SPEC_CTRL.
*/
- if (vls) {
- svm_clr_intercept(svm, INTERCEPT_VMLOAD);
- svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- }
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm);
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
- if (sev_guest(svm->vcpu.kvm)) {
+ if (sev_guest(vcpu->kvm)) {
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/* Perform SEV-ES specific VMCB updates */
sev_es_init_vmcb(svm);
}
svm->virt_spec_ctrl = 0;
if (!init_event) {
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
- init_vmcb(svm);
+ init_vmcb(vcpu);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
kvm_rdx_write(vcpu, eax);
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *vmcb_page;
+ struct page *vmcb01_page;
struct page *vmsa_page = NULL;
int err;
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!vmcb_page)
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb01_page)
goto out;
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/*
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
- svm->vmcb = page_address(vmcb_page);
- svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
if (vmsa_page)
svm->vmsa = page_address(vmsa_page);
- svm->asid_generation = 0;
svm->guest_state_loaded = false;
- init_vmcb(svm);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+ init_vmcb(vcpu);
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
/* Perform SEV-ES specific VMCB creation updates */
sev_es_create_vcpu(svm);
if (vmsa_page)
__free_page(vmsa_page);
error_free_vmcb_page:
- __free_page(vmcb_page);
+ __free_page(vmcb01_page);
out:
return err;
}
sev_free_vcpu(vcpu);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
sev_es_prepare_guest_switch(svm, vcpu->cpu);
} else {
vmsave(__sme_page_pa(sd->save_area));
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
-
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
if (is_guest_mode(&svm->vcpu)) {
- svm->nested.hsave->control.int_ctl &= mask;
+ svm->vmcb01.ptr->control.int_ctl &= mask;
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
switch (seg) {
case VCPU_SREG_CS: return &save->cs;
case VCPU_SREG_DS: return &save->ds;
case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
}
BUG();
return NULL;
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
- ulong gcr0;
- u64 *hcr0;
-
- /*
- * SEV-ES guests must always keep the CR intercepts cleared. CR
- * tracking is done using the CR write traps.
- */
- if (sev_es_guest(svm->vcpu.kvm))
- return;
-
- gcr0 = svm->vcpu.arch.cr0;
- hcr0 = &svm->vmcb->save.cr0;
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
- vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
- if (gcr0 == *hcr0) {
- svm_clr_intercept(svm, INTERCEPT_CR0_READ);
- svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
- } else {
- svm_set_intercept(svm, INTERCEPT_CR0_READ);
- svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
- }
-}
-
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
vcpu->arch.cr0 = cr0;
if (!npt_enabled)
- cr0 |= X86_CR0_PG | X86_CR0_WP;
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
/*
* re-enable caching here because the QEMU bios
* reboot
*/
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
- update_cr0_intercept(svm);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
}
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
- svm->asid_generation = sd->asid_generation;
+ svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
}
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
{
- u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
- return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
u64 error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
- if (!(svm->vcpu.guest_debug &
+ if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
- kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
return 1;
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- if (svm->vcpu.guest_debug &
+ if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
return 1;
}
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
return 0;
}
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
{
- return handle_ud(&svm->vcpu);
+ return handle_ud(vcpu);
}
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
return 1;
}
return true;
}
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
if (is_erratum_383()) {
/*
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
kvm_machine_check();
}
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
/*
* The VM save area has already been encrypted so it
* cannot be reinitialized - just terminate.
*/
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
return -EINVAL;
/*
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ init_vmcb(vcpu);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
- ++svm->vcpu.stat.io_exits;
+ ++vcpu->stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
port = io_info >> 16;
svm->next_rip = svm->vmcb->control.exit_info_2;
- return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
- return 1;
+ return kvm_fast_pio(vcpu, size, port, in);
}
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
{
- ++svm->vcpu.stat.irq_exits;
return 1;
}
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
{
+ ++vcpu->stat.irq_exits;
return 1;
}
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
struct kvm_host_map map;
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
if (ret) {
if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
- nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ if (vmload) {
+ nested_svm_vmloadsave(vmcb12, svm->vmcb);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else
+ nested_svm_vmloadsave(svm->vmcb, vmcb12);
+
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
{
- struct vmcb *nested_vmcb;
- struct kvm_host_map map;
- int ret;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
- if (ret) {
- if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- nested_vmcb = map.hva;
-
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ return vmload_vmsave_interception(vcpu, true);
+}
- return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
}
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
{
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- return nested_svm_vmrun(svm);
+ return nested_svm_vmrun(vcpu);
}
enum {
[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
};
- int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_INSTR_VMRUN] = vmrun_interception,
[SVM_INSTR_VMLOAD] = vmload_interception,
[SVM_INSTR_VMSAVE] = vmsave_interception,
int ret;
if (is_guest_mode(vcpu)) {
- svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
/* Returns '1' or -errno on failure, '0' on success. */
- ret = nested_svm_vmexit(svm);
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
if (ret)
return ret;
return 1;
}
- return svm_instr_handlers[opcode](svm);
+ return svm_instr_handlers[opcode](vcpu);
}
/*
* regions (e.g. SMM memory on host).
* 2) VMware backdoor
*/
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 error_code = svm->vmcb->control.exit_info_1;
int opcode;
}
}
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, true);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
return ret;
}
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, false);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
return ret;
}
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
- kvm_rax_read(&svm->vcpu));
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, kvm_rcx_read(vcpu),
+ kvm_rax_read(vcpu));
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
-
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ kvm_mmu_invlpg(vcpu, kvm_rax_read(vcpu));
-static int skinit_interception(struct vcpu_svm *svm)
-{
- trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
-
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_wbinvd(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
{
- u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_rcx_read(&svm->vcpu);
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
- int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
- return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u16 tss_selector;
int reason;
int int_type = svm->vmcb->control.exit_int_info &
if (reason == TASK_SWITCH_GATE) {
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
+ vcpu->arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
if (svm->vmcb->control.exit_info_2 &
error_code =
(u32)svm->vmcb->control.exit_info_2;
}
- kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(vcpu);
break;
default:
break;
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(&svm->vcpu))
+ if (!skip_emulated_instruction(vcpu))
return 0;
}
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1;
- return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
has_error_code, error_code);
}
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_cpuid(&svm->vcpu);
-}
+ struct vcpu_svm *svm = to_svm(vcpu);
-static int iret_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ ++vcpu->stat.nmi_window_exits;
+ vcpu->arch.hflags |= HF_IRET_MASK;
+ if (!sev_es_guest(vcpu->kvm)) {
svm_clr_intercept(svm, INTERCEPT_IRET);
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
}
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
-static int invd_interception(struct vcpu_svm *svm)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return kvm_emulate_instruction(&svm->vcpu, 0);
-
- kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ return kvm_emulate_instruction(vcpu, 0);
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_instruction(&svm->vcpu, 0);
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int rsm_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+ return kvm_emulate_instruction(vcpu, 0);
}
-static int rdpmc_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
{
- int err;
-
- if (!nrips)
- return emulate_on_interception(svm);
-
- err = kvm_rdpmc(&svm->vcpu);
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
unsigned long val)
{
- unsigned long cr0 = svm->vcpu.arch.cr0;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
bool ret = false;
- if (!is_guest_mode(&svm->vcpu) ||
+ if (!is_guest_mode(vcpu) ||
(!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
#define CR_VALID (1ULL << 63)
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, cr;
unsigned long val;
int err;
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
err = 0;
if (cr >= 16) { /* mov to cr */
cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- if (!check_selective_cr0_intercepted(svm, val))
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
else
return 1;
break;
case 3:
- err = kvm_set_cr3(&svm->vcpu, val);
+ err = kvm_set_cr3(vcpu, val);
break;
case 4:
- err = kvm_set_cr4(&svm->vcpu, val);
+ err = kvm_set_cr4(vcpu, val);
break;
case 8:
- err = kvm_set_cr8(&svm->vcpu, val);
+ err = kvm_set_cr8(vcpu, val);
break;
default:
WARN(1, "unhandled write to CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
} else { /* mov from cr */
switch (cr) {
case 0:
- val = kvm_read_cr0(&svm->vcpu);
+ val = kvm_read_cr0(vcpu);
break;
case 2:
- val = svm->vcpu.arch.cr2;
+ val = vcpu->arch.cr2;
break;
case 3:
- val = kvm_read_cr3(&svm->vcpu);
+ val = kvm_read_cr3(vcpu);
break;
case 4:
- val = kvm_read_cr4(&svm->vcpu);
+ val = kvm_read_cr4(vcpu);
break;
case 8:
- val = kvm_get_cr8(&svm->vcpu);
+ val = kvm_get_cr8(vcpu);
break;
default:
WARN(1, "unhandled read from CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
}
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_complete_insn_gp(vcpu, err);
}
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned long old_value, new_value;
unsigned int cr;
int ret = 0;
kvm_post_set_cr4(vcpu, old_value, new_value);
break;
case 8:
- ret = kvm_set_cr8(&svm->vcpu, new_value);
+ ret = kvm_set_cr8(vcpu, new_value);
break;
default:
WARN(1, "unhandled CR%d write trap", cr);
return kvm_complete_insn_gp(vcpu, ret);
}
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, dr;
unsigned long val;
int err = 0;
- if (svm->vcpu.guest_debug == 0) {
+ if (vcpu->guest_debug == 0) {
/*
* No more DR vmexits; force a reload of the debug registers
* and reenter on this instruction. The next vmexit will
* retrieve the full state of the debug registers.
*/
clr_dr_intercepts(svm);
- svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
return 1;
}
if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
dr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
- err = kvm_set_dr(&svm->vcpu, dr, val);
+ val = kvm_register_read(vcpu, reg);
+ err = kvm_set_dr(vcpu, dr, val);
} else {
- kvm_get_dr(&svm->vcpu, dr, &val);
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_get_dr(vcpu, dr, &val);
+ kvm_register_write(vcpu, reg, val);
}
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_complete_insn_gp(vcpu, err);
}
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
int r;
- u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ u8 cr8_prev = kvm_get_cr8(vcpu);
/* instruction emulation calls kvm_set_cr8() */
- r = cr_interception(svm);
- if (lapic_in_kernel(&svm->vcpu))
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
return r;
- if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+ if (cr8_prev <= kvm_get_cr8(vcpu))
return r;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
{
struct msr_data msr_info;
int ret;
*/
msr_info.host_initiated = false;
msr_info.index = MSR_EFER;
- msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
- ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
- return kvm_complete_insn_gp(&svm->vcpu, ret);
+ return kvm_complete_insn_gp(vcpu, ret);
}
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
switch (msr_info->index) {
case MSR_STAR:
- msr_info->data = svm->vmcb->save.star;
+ msr_info->data = svm->vmcb01.ptr->save.star;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- msr_info->data = svm->vmcb->save.lstar;
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
break;
case MSR_CSTAR:
- msr_info->data = svm->vmcb->save.cstar;
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
- msr_info->data = svm->vmcb->save.kernel_gs_base;
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
- msr_info->data = svm->vmcb->save.sfmask;
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- msr_info->data = svm->vmcb->save.sysenter_cs;
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- msr_info->data = svm->sysenter_eip;
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
- msr_info->data = svm->sysenter_esp;
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
!guest_has_spec_ctrl_msr(vcpu))
return 1;
- msr_info->data = svm->spec_ctrl;
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr_info->host_initiated &&
static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (!sev_es_guest(svm->vcpu.kvm) || !err)
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
ghcb_set_sw_exit_info_1(svm->ghcb, 1);
ghcb_set_sw_exit_info_2(svm->ghcb,
return 1;
}
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
return 1;
vcpu->arch.pat = data;
- svm->vmcb->save.g_pat = data;
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
break;
case MSR_IA32_SPEC_CTRL:
if (kvm_spec_ctrl_test_value(data))
return 1;
- svm->spec_ctrl = data;
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
if (!data)
break;
svm->virt_spec_ctrl = data;
break;
case MSR_STAR:
- svm->vmcb->save.star = data;
+ svm->vmcb01.ptr->save.star = data;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
+ svm->vmcb01.ptr->save.lstar = data;
break;
case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
+ svm->vmcb01.ptr->save.cstar = data;
break;
case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
+ svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
+ svm->vmcb01.ptr->save.sfmask = data;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
+ svm->vmcb01.ptr->save.sysenter_cs = data;
break;
case MSR_IA32_SYSENTER_EIP:
- svm->sysenter_eip = data;
- svm->vmcb->save.sysenter_eip = data;
+ svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+ /*
+ * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+ * when we spoof an Intel vendor ID (for cross vendor migration).
+ * In this case we use this intercept to track the high
+ * 32 bit part of these msrs to support Intel's
+ * implementation of SYSENTER/SYSEXIT.
+ */
+ svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
- svm->sysenter_esp = data;
- svm->vmcb->save.sysenter_esp = data;
+ svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
return 0;
}
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm);
+ if (to_svm(vcpu)->vmcb->control.exit_info_1)
+ return kvm_emulate_wrmsr(vcpu);
else
- return rdmsr_interception(svm);
+ return kvm_emulate_rdmsr(vcpu);
}
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- svm_clear_vintr(svm);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ svm_clear_vintr(to_svm(vcpu));
/*
* For AVIC, the only reason to end up here is ExtINTs.
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
*/
- svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+ svm_toggle_avic_for_irq_window(vcpu, true);
- ++svm->vcpu.stat.irq_window_exits;
+ ++vcpu->stat.irq_window_exits;
return 1;
}
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel;
/*
* vcpu->arch.preempted_in_kernel can never be true. Just
* set in_kernel to false as well.
*/
- in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel);
- return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
- return kvm_skip_emulated_instruction(&(svm->vcpu));
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned long type;
gva_t gva;
return kvm_handle_invpcid(vcpu, type, gva);
}
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_READ_CR4] = cr_interception,
[SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
+ [SVM_EXIT_SMI] = kvm_emulate_as_nop,
+ [SVM_EXIT_INIT] = kvm_emulate_as_nop,
[SVM_EXIT_VINTR] = interrupt_window_interception,
- [SVM_EXIT_RDPMC] = rdpmc_interception,
- [SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
+ [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
[SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = invd_interception,
+ [SVM_EXIT_INVD] = kvm_emulate_invd,
[SVM_EXIT_PAUSE] = pause_interception,
- [SVM_EXIT_HLT] = halt_interception,
+ [SVM_EXIT_HLT] = kvm_emulate_halt,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
[SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
+ [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
[SVM_EXIT_VMLOAD] = vmload_interception,
[SVM_EXIT_VMSAVE] = vmsave_interception,
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
- [SVM_EXIT_WBINVD] = wbinvd_interception,
- [SVM_EXIT_MONITOR] = monitor_interception,
- [SVM_EXIT_MWAIT] = mwait_interception,
- [SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
+ [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
+ [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
+ [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
+ [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
[SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
[SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
[SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
save->ds.limit, save->ds.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"fs:",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
+ save01->fs.selector, save01->fs.attrib,
+ save01->fs.limit, save01->fs.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"gs:",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
+ save01->gs.selector, save01->gs.attrib,
+ save01->gs.limit, save01->gs.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"gdtr:",
save->gdtr.selector, save->gdtr.attrib,
save->gdtr.limit, save->gdtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"ldtr:",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
+ save01->ldtr.selector, save01->ldtr.attrib,
+ save01->ldtr.limit, save01->ldtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"idtr:",
save->idtr.selector, save->idtr.attrib,
save->idtr.limit, save->idtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"tr:",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
+ save01->tr.selector, save01->tr.attrib,
+ save01->tr.limit, save01->tr.base);
pr_err("cpl: %d efer: %016llx\n",
save->cpl, save->efer);
pr_err("%-15s %016llx %-13s %016llx\n",
pr_err("%-15s %016llx %-13s %016llx\n",
"rsp:", save->rsp, "rax:", save->rax);
pr_err("%-15s %016llx %-13s %016llx\n",
- "star:", save->star, "lstar:", save->lstar);
+ "star:", save01->star, "lstar:", save01->lstar);
pr_err("%-15s %016llx %-13s %016llx\n",
- "cstar:", save->cstar, "sfmask:", save->sfmask);
+ "cstar:", save01->cstar, "sfmask:", save01->sfmask);
pr_err("%-15s %016llx %-13s %016llx\n",
- "kernel_gs_base:", save->kernel_gs_base,
- "sysenter_cs:", save->sysenter_cs);
+ "kernel_gs_base:", save01->kernel_gs_base,
+ "sysenter_cs:", save01->sysenter_cs);
pr_err("%-15s %016llx %-13s %016llx\n",
- "sysenter_esp:", save->sysenter_esp,
- "sysenter_eip:", save->sysenter_eip);
+ "sysenter_esp:", save01->sysenter_esp,
+ "sysenter_eip:", save01->sysenter_eip);
pr_err("%-15s %016llx %-13s %016llx\n",
"gpat:", save->g_pat, "dbgctl:", save->dbgctl);
pr_err("%-15s %016llx %-13s %016llx\n",
return -EINVAL;
}
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+ if (svm_handle_invalid_exit(vcpu, exit_code))
return 0;
#ifdef CONFIG_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
- return msr_interception(svm);
+ return msr_interception(vcpu);
else if (exit_code == SVM_EXIT_VINTR)
- return interrupt_window_interception(svm);
+ return interrupt_window_interception(vcpu);
else if (exit_code == SVM_EXIT_INTR)
- return intr_interception(svm);
+ return intr_interception(vcpu);
else if (exit_code == SVM_EXIT_HLT)
- return halt_interception(svm);
+ return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
- return npf_interception(svm);
+ return npf_interception(vcpu);
#endif
- return svm_exit_handlers[exit_code](svm);
+ return svm_exit_handlers[exit_code](vcpu);
}
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- return svm_invoke_exit_handler(svm, exit_code);
+ return svm_invoke_exit_handler(vcpu, exit_code);
}
static void reload_tss(struct kvm_vcpu *vcpu)
load_TR_desc();
}
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If the previous vmrun of the vmcb occurred on a different physical
+ * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
+ * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+ */
+ if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+ svm->current_vmcb->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ svm->current_vmcb->cpu = vcpu->cpu;
+ }
- if (sev_guest(svm->vcpu.kvm))
- return pre_sev_run(svm, svm->vcpu.cpu);
+ if (sev_guest(vcpu->kvm))
+ return pre_sev_run(svm, vcpu->cpu);
/* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != sd->asid_generation)
+ if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
}
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
return false;
ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (svm->vcpu.arch.hflags & HF_NMI_MASK);
+ (vcpu->arch.hflags & HF_NMI_MASK);
return ret;
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ return !!(vcpu->arch.hflags & HF_NMI_MASK);
}
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
struct vcpu_svm *svm = to_svm(vcpu);
if (masked) {
- svm->vcpu.arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
} else {
- svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ vcpu->arch.hflags &= ~HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
if (!gif_set(svm))
return true;
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/*
* SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
* bit to determine the state of the IF flag.
} else if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
- ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+ ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
: !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
return true;
{
struct vcpu_svm *svm = to_svm(vcpu);
- if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
- == HF_NMI_MASK)
+ if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
else
- svm->asid_generation--;
+ svm->current_vmcb->asid_generation--;
}
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
- (sev_es_guest(svm->vcpu.kvm) ||
- kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
- svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ (sev_es_guest(vcpu->kvm) ||
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
if (!(exitintinfo & SVM_EXITINTINFO_VALID))
return;
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = true;
+ vcpu->arch.nmi_injected = true;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
/*
*/
if (kvm_exception_is_soft(vector)) {
if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
- kvm_rip_write(&svm->vcpu,
- kvm_rip_read(&svm->vcpu) -
- int3_injected);
+ kvm_is_linear_rip(vcpu, svm->int3_rip))
+ kvm_rip_write(vcpu,
+ kvm_rip_read(vcpu) - int3_injected);
break;
}
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(&svm->vcpu, vector, err);
+ kvm_requeue_exception_e(vcpu, vector, err);
} else
- kvm_requeue_exception(&svm->vcpu, vector);
+ kvm_requeue_exception(vcpu, vector);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_queue_interrupt(&svm->vcpu, vector, false);
+ kvm_queue_interrupt(vcpu, vector, false);
break;
default:
break;
control->exit_int_info = control->event_inj;
control->exit_int_info_err = control->event_inj_err;
control->event_inj = 0;
- svm_complete_interrupts(svm);
+ svm_complete_interrupts(vcpu);
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long vmcb_pa = svm->current_vmcb->pa;
+
/*
* VMENTER enables interrupts (host state), but the kernel state is
* interrupts disabled when this is invoked. Also tell RCU about
guest_enter_irqoff();
lockdep_hardirqs_on(CALLER_ADDR0);
- if (sev_es_guest(svm->vcpu.kvm)) {
- __svm_sev_es_vcpu_run(svm->vmcb_pa);
+ if (sev_es_guest(vcpu->kvm)) {
+ __svm_sev_es_vcpu_run(vmcb_pa);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ vmload(svm->vmcb01.pa);
+ __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+ vmsave(svm->vmcb01.pa);
vmload(__sme_page_pa(sd->save_area));
}
smp_send_reschedule(vcpu->cpu);
}
- pre_svm_run(svm);
+ pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
svm_set_dr6(svm, vcpu->arch.dr6);
else
svm_set_dr6(svm, DR6_ACTIVE_LOW);
* is no need to worry about the conditional branch over the wrmsr
* being speculatively taken.
*/
- x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu, svm);
+ svm_vcpu_enter_exit(vcpu);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+ unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- if (!sev_es_guest(svm->vcpu.kvm))
+ if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
- x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ if (!sev_es_guest(vcpu->kvm)) {
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
}
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_interrupt(&svm->vcpu);
+ kvm_before_interrupt(vcpu);
kvm_load_host_xsave_state(vcpu);
stgi();
/* Any pending NMI will happen here */
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_after_interrupt(&svm->vcpu);
+ kvm_after_interrupt(vcpu);
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
- if (is_guest_mode(&svm->vcpu)) {
- sync_nested_vmcb_control(svm);
+ if (is_guest_mode(vcpu)) {
+ nested_sync_control_from_vmcb02(svm);
svm->nested.nested_run_pending = 0;
}
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
- svm->vcpu.arch.apf.host_apf_flags =
+ vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
if (npt_enabled) {
*/
if (unlikely(svm->vmcb->control.exit_code ==
SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(svm);
+ svm_handle_mce(vcpu);
- svm_complete_interrupts(svm);
+ svm_complete_interrupts(vcpu);
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
return svm_exit_handlers_fastpath(vcpu);
}
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long cr3;
- cr3 = __sme_set(root);
if (npt_enabled) {
- svm->vmcb->control.nested_cr3 = cr3;
+ svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
/* Loading L2's CR3 is handled by enter_svm_guest_mode. */
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
return;
cr3 = vcpu->arch.cr3;
+ } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+ cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+ } else {
+ /* PCID in the guest should be impossible with a 32-bit MMU. */
+ WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+ cr3 = root_hpa;
}
svm->vmcb->save.cr3 = cr3;
/* Update nrips enabled cache */
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+ guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
/* Check again if INVPCID interception if required */
svm_check_invpcid(svm);
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ /*
+ * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+ * is exposed to the guest, disable AVIC.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+ kvm_request_apicv_update(vcpu->kvm, false,
+ APICV_INHIBIT_REASON_X2APIC);
- /*
- * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
- * is exposed to the guest, disable AVIC.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_X2APIC);
+ /*
+ * Currently, AVIC does not work with nested virtualization.
+ * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+ */
+ if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ kvm_request_apicv_update(vcpu->kvm, false,
+ APICV_INHIBIT_REASON_NESTED);
+ }
- /*
- * Currently, AVIC does not work with nested virtualization.
- * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
- */
- if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_NESTED);
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
}
static bool svm_has_wbinvd_exit(void)
if (!(saved_efer & EFER_SVME))
return 1;
- if (kvm_vcpu_map(&svm->vcpu,
+ if (kvm_vcpu_map(vcpu,
gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
return 1;
if (svm_allocate_nested(svm))
return 1;
- ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+ kvm_vcpu_unmap(vcpu, &map, true);
}
}
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
+ .vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
.can_emulate_instruction = svm_can_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define MAX_DIRECT_ACCESS_MSRS 18
+#define IOPM_SIZE PAGE_SIZE * 3
+#define MSRPM_SIZE PAGE_SIZE * 2
+
+#define MAX_DIRECT_ACCESS_MSRS 20
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
};
struct kvm_vcpu;
+struct kvm_vmcb_info {
+ struct vmcb *ptr;
+ unsigned long pa;
+ int cpu;
+ uint64_t asid_generation;
+};
+
struct svm_nested_state {
- struct vmcb *hsave;
+ struct kvm_vmcb_info vmcb02;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb12_gpa;
+ u64 last_vmcb12_gpa;
/* These are the merged vectors */
u32 *msrpm;
struct vcpu_svm {
struct kvm_vcpu vcpu;
+ /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
struct vmcb *vmcb;
- unsigned long vmcb_pa;
+ struct kvm_vmcb_info vmcb01;
+ struct kvm_vmcb_info *current_vmcb;
struct svm_cpu_data *svm_data;
u32 asid;
- uint64_t asid_generation;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
+ u32 sysenter_esp_hi;
+ u32 sysenter_eip_hi;
uint64_t tsc_aux;
u64 msr_decfg;
vmcb->control.clean &= ~(1 << bit);
}
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
{
- return container_of(vcpu, struct vcpu_svm, vcpu);
+ return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
}
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
+ return container_of(vcpu, struct vcpu_svm, vcpu);
}
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
if (!sev_es_guest(svm->vcpu.kvm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb->control.intercepts[INTERCEPT_DR] = 0;
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_set_intercept(&vmcb->control, bit);
static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_clr_intercept(&vmcb->control, bit);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
void svm_leave_nested(struct vcpu_svm *svm);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+ return nested_svm_vmexit(svm);
+}
+
int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm);
void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
struct kvm_enc_region *range);
int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_hardware_setup(void);
void sev_hardware_teardown(void);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
/* Enter guest mode */
sti
-1: vmload %_ASM_AX
- jmp 3f
-2: cmpb $0, kvm_rebooting
- jne 3f
- ud2
- _ASM_EXTABLE(1b, 2b)
-3: vmrun %_ASM_AX
- jmp 5f
-4: cmpb $0, kvm_rebooting
- jne 5f
- ud2
- _ASM_EXTABLE(3b, 4b)
+1: vmrun %_ASM_AX
-5: vmsave %_ASM_AX
- jmp 7f
-6: cmpb $0, kvm_rebooting
- jne 7f
- ud2
- _ASM_EXTABLE(5b, 6b)
-7:
- cli
+2: cli
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
#endif
pop %_ASM_BP
ret
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
SYM_FUNC_END(__svm_vcpu_run)
/**
#endif
push %_ASM_BX
- /* Enter guest mode */
+ /* Move @vmcb to RAX. */
mov %_ASM_ARG1, %_ASM_AX
+
+ /* Enter guest mode */
sti
1: vmrun %_ASM_AX
- jmp 3f
-2: cmpb $0, kvm_rebooting
- jne 3f
- ud2
- _ASM_EXTABLE(1b, 2b)
-3: cli
+2: cli
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
#endif
pop %_ASM_BP
ret
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
SYM_FUNC_END(__svm_sev_es_vcpu_run)
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
-#define CC(consistency_check) \
-({ \
- bool failed = (consistency_check); \
- if (failed) \
- trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
- failed; \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
/*
* Hyper-V requires all of these, so mark them as supported even though
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
secondary_exec_controls_set(vmx, exec_control);
}
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
enum nested_evmptrld_status evmptrld_status;
+ ++vcpu->stat.nested_run;
+
if (!nested_vmx_check_permission(vcpu))
return 1;
/*
* Process any exceptions that are not debug traps before MTF.
+ *
+ * Note that only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
+ * while delivering the pending exception.
*/
+
if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
- if (block_nested_events)
+ if (vmx->nested.nested_run_pending)
return -EBUSY;
if (!nested_vmx_check_exception(vcpu, &exit_qual))
goto no_vmexit;
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ if (vmx->nested.nested_run_pending)
return -EBUSY;
if (!nested_vmx_check_exception(vcpu, &exit_qual))
goto no_vmexit;
{
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
+ if (to_vmx(vcpu)->exit_reason.enclave_mode)
+ vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ /* Similarly, triple faults in L2 should never escape. */
+ WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* Service the TLB flush request for L2 before switching to L1. */
vmx->fail = 0;
}
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
/*
* Decode the memory-address operand of a vmx instruction, as recorded on an
* exit caused by such an instruction (run by a guest hypervisor).
if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
- kvm_mmu_unload(vcpu);
mmu->ept_ad = accessed_dirty;
mmu->mmu_role.base.ad_disabled = !accessed_dirty;
vmcs12->ept_pointer = new_eptp;
- /*
- * TODO: Check what's the correct approach in case
- * mmu reload fails. Currently, we just let the next
- * reload potentially fail
- */
- kvm_mmu_reload(vcpu);
+
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
}
return 0;
return false;
}
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 encls_leaf;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+ return false;
+
+ encls_leaf = kvm_rax_read(vcpu);
+ if (encls_leaf > 62)
+ encls_leaf = 63;
+ return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12, gpa_t bitmap)
{
case EXIT_REASON_VMFUNC:
/* VM functions are emulated through L2->L0 vmexits. */
return true;
- case EXIT_REASON_ENCLS:
- /* SGX is never exposed to L1 */
- return true;
default:
break;
}
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+ case EXIT_REASON_ENCLS:
+ return nested_vmx_exit_handled_encls(vcpu, vmcs12);
default:
return true;
}
msrs->secondary_ctls_high |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (enable_sgx)
+ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
msrs->misc_low,
struct kvm_x86_nested_ops vmx_nested_ops = {
.check_events = vmx_check_nested_events,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
+ .triple_fault = nested_vmx_triple_fault,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
.get_nested_state_pages = vmx_get_nested_state_pages,
PIN_BASED_EXT_INTR_MASK;
}
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode. Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+ int size, int alignment, gva_t *gva)
+{
+ struct kvm_segment s;
+ bool fault;
+
+ /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+ *gva = offset;
+ if (!is_long_mode(vcpu)) {
+ vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+ *gva += s.base;
+ }
+
+ if (!IS_ALIGNED(*gva, alignment)) {
+ fault = true;
+ } else if (likely(is_long_mode(vcpu))) {
+ fault = is_noncanonical_address(*gva, vcpu);
+ } else {
+ *gva &= 0xffffffff;
+ fault = (s.unusable) ||
+ (s.type != 2 && s.type != 3) ||
+ (*gva > s.limit) ||
+ ((s.base != 0 || s.limit != 0xffffffff) &&
+ (((u64)*gva + size - 1) > s.limit + 1));
+ }
+ if (fault)
+ kvm_inject_gp(vcpu, 0);
+ return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+ unsigned int size)
+{
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = addr;
+ vcpu->run->internal.data[1] = size;
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+ unsigned int size)
+{
+ if (__copy_from_user(data, (void __user *)hva, size)) {
+ sgx_handle_emulation_failure(vcpu, hva, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+ gpa_t *gpa)
+{
+ struct x86_exception ex;
+
+ if (write)
+ *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+ else
+ *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+ if (*gpa == UNMAPPED_GVA) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+ *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+ if (kvm_is_error_hva(*hva)) {
+ sgx_handle_emulation_failure(vcpu, gpa, 1);
+ return -EFAULT;
+ }
+
+ *hva |= gpa & ~PAGE_MASK;
+
+ return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+ struct x86_exception ex;
+
+ /*
+ * A non-EPCM #PF indicates a bad userspace HVA. This *should* check
+ * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+ * but the error code isn't (yet) plumbed through the ENCLS helpers.
+ */
+ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ /*
+ * If the guest thinks it's running on SGX2 hardware, inject an SGX
+ * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+ * #PF on SGX2). The assumption is that EPCM faults are much more
+ * likely than a bad userspace address.
+ */
+ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ memset(&ex, 0, sizeof(ex));
+ ex.vector = PF_VECTOR;
+ ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+ PFERR_SGX_MASK;
+ ex.address = gva;
+ ex.error_code_valid = true;
+ ex.nested_page_fault = false;
+ kvm_inject_page_fault(vcpu, &ex);
+ } else {
+ kvm_inject_gp(vcpu, 0);
+ }
+ return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+ struct sgx_pageinfo *pageinfo,
+ unsigned long secs_hva,
+ gva_t secs_gva)
+{
+ struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+ struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+ u64 attributes, xfrm, size;
+ u32 miscselect;
+ u8 max_size_log2;
+ int trapnr, ret;
+
+ sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ if (!sgx_12_0 || !sgx_12_1) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ miscselect = contents->miscselect;
+ attributes = contents->attributes;
+ xfrm = contents->xfrm;
+ size = contents->size;
+
+ /* Enforce restriction of access to the PROVISIONKEY. */
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+ (attributes & SGX_ATTR_PROVISIONKEY)) {
+ if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+ pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+ if ((u32)miscselect & ~sgx_12_0->ebx ||
+ (u32)attributes & ~sgx_12_1->eax ||
+ (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+ (u32)xfrm & ~sgx_12_1->ecx ||
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restriction on max enclave size. */
+ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+ sgx_12_0->edx;
+ if (size >= BIT_ULL(max_size_log2))
+ kvm_inject_gp(vcpu, 0);
+
+ /*
+ * sgx_virt_ecreate() returns:
+ * 1) 0: ECREATE was successful
+ * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+ * exception number.
+ * 3) -EINVAL: access_ok() on @secs_hva failed. This should never
+ * happen as KVM checks host addresses at memslot creation.
+ * sgx_virt_ecreate() has already warned in this case.
+ */
+ ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+ if (!ret)
+ return kvm_skip_emulated_instruction(vcpu);
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ gva_t pageinfo_gva, secs_gva;
+ gva_t metadata_gva, contents_gva;
+ gpa_t metadata_gpa, contents_gpa, secs_gpa;
+ unsigned long metadata_hva, contents_hva, secs_hva;
+ struct sgx_pageinfo pageinfo;
+ struct sgx_secs *contents;
+ struct x86_exception ex;
+ int r;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+ return 1;
+
+ /*
+ * Copy the PAGEINFO to local memory, its pointers need to be
+ * translated, i.e. we need to do a deep copy/translate.
+ */
+ r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+ sizeof(pageinfo), &ex);
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return 1;
+ } else if (r != X86EMUL_CONTINUE) {
+ sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+ sizeof(pageinfo));
+ return 0;
+ }
+
+ if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+ sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+ &contents_gva))
+ return 1;
+
+ /*
+ * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+ sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid.
+ */
+ if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+ sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+ return 0;
+
+ /*
+ * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+ * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+ * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+ * enforce restriction of access to the PROVISIONKEY.
+ */
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!contents)
+ return -ENOMEM;
+
+ /* Exit to userspace if copying from a host userspace address fails. */
+ if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+ free_page((unsigned long)contents);
+ return 0;
+ }
+
+ pageinfo.metadata = metadata_hva;
+ pageinfo.contents = (u64)contents;
+
+ r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+ free_page((unsigned long)contents);
+
+ return r;
+}
+
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+ unsigned long sig_hva, secs_hva, token_hva, rflags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t sig_gva, secs_gva, token_gva;
+ gpa_t sig_gpa, secs_gpa, token_gpa;
+ int ret, trapnr;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+ return 1;
+
+ /*
+ * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+ sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid. Note, all structures are aligned and
+ * cannot split pages.
+ */
+ if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+ sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+ return 0;
+
+ ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+ (void __user *)secs_hva,
+ vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ /*
+ * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+ * @token_hva or @secs_hva. This should never happen as KVM checks host
+ * addresses at memslot creation. sgx_virt_einit() has already warned
+ * in this case, so just return.
+ */
+ if (ret < 0)
+ return ret;
+
+ rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+ X86_EFLAGS_AF | X86_EFLAGS_SF |
+ X86_EFLAGS_OF);
+ if (ret)
+ rflags |= X86_EFLAGS_ZF;
+ else
+ rflags &= ~X86_EFLAGS_ZF;
+ vmx_set_rflags(vcpu, rflags);
+
+ kvm_rax_write(vcpu, ret);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+ if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ return false;
+
+ if (leaf >= ECREATE && leaf <= ETRACK)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+ if (leaf >= EAUG && leaf <= EMODT)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+ return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+ const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+ return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+ u32 leaf = (u32)kvm_rax_read(vcpu);
+
+ if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ } else if (!sgx_enabled_in_guest_bios(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ } else {
+ if (leaf == ECREATE)
+ return handle_encls_ecreate(vcpu);
+ if (leaf == EINIT)
+ return handle_encls_einit(vcpu);
+ WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+ return 0;
+ }
+ return 1;
+}
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+ /*
+ * Use Intel's default value for Skylake hardware if Launch Control is
+ * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+ * Launch Control is supported and enabled, i.e. mimic the reset value
+ * and let the guest write the MSRs at will. If Launch Control is
+ * supported but disabled, then use the current MSR values as the hash
+ * MSRs exist but are read-only (locked and not writable).
+ */
+ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+ rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+ sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+ sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+ sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+ sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+ } else {
+ /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+ }
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+ sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *guest_cpuid;
+ u32 eax, ebx, ecx, edx;
+
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+ guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+ return true;
+
+ return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ /*
+ * There is no software enable bit for SGX that is virtualized by
+ * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+ * guest (either by the host or by the guest's BIOS) but enabled in the
+ * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+ * the expected system behavior for ENCLS.
+ */
+ u64 bitmap = -1ull;
+
+ /* Nothing to do if hardware doesn't support SGX */
+ if (!cpu_has_vmx_encls_vmexit())
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ sgx_enabled_in_guest_bios(vcpu)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+ if (sgx_intercept_encls_ecreate(vcpu))
+ bitmap |= (1 << ECREATE);
+ }
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+ /*
+ * Trap and execute EINIT if launch control is enabled in the
+ * host using the guest's values for launch control MSRs, even
+ * if the guest's values are fixed to hardware default values.
+ * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+ * the MSRs is extraordinarily expensive.
+ */
+ if (boot_cpu_has(X86_FEATURE_SGX_LC))
+ bitmap |= (1 << EINIT);
+
+ if (!vmcs12 && is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+ bitmap |= vmcs12->encls_exiting_bitmap;
+ }
+ vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ /* Nothing to do if hardware doesn't support SGX */
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
FIELD64(VMREAD_BITMAP, vmread_bitmap),
FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
u64 vm_function_control;
u64 eptp_list_address;
u64 pml_address;
- u64 padding64[3]; /* room for future expansion */
+ u64 encls_exiting_bitmap;
+ u64 padding64[2]; /* room for future expansion */
/*
* To allow migration of L1 (complete with its L2 guests) between
* machines of different natural widths (32 or 64 bit), we cannot have
CHECK_OFFSET(vm_function_control, 296);
CHECK_OFFSET(eptp_list_address, 304);
CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(encls_exiting_bitmap, 320);
CHECK_OFFSET(cr0_guest_host_mask, 344);
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- u64 tmp_eptp = INVALID_PAGE;
- int i;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!VALID_PAGE(tmp_eptp)) {
- tmp_eptp = to_vmx(vcpu)->ept_pointer;
- } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_MISMATCH;
- return;
- }
- }
-
- to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
range->pages);
}
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+ struct kvm_tlb_range *range)
{
- u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
- /*
- * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
- * of the base of EPT PML4 table, strip off EPT configuration
- * information.
- */
if (range)
- return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+ return hyperv_flush_guest_mapping_range(root_ept,
kvm_fill_hv_flush_list_func, (void *)range);
else
- return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+ return hyperv_flush_guest_mapping(root_ept);
}
static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range)
{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
struct kvm_vcpu *vcpu;
- int ret = 0, i;
+ int ret = 0, i, nr_unique_valid_roots;
+ hpa_t root;
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ spin_lock(&kvm_vmx->hv_root_ept_lock);
- if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
- check_ept_pointer_match(kvm);
+ if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+ nr_unique_valid_roots = 0;
- if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
kvm_for_each_vcpu(i, vcpu, kvm) {
- /* If ept_pointer is invalid pointer, bypass flush request. */
- if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
- ret |= __hv_remote_flush_tlb_with_range(
- kvm, vcpu, range);
+ root = to_vmx(vcpu)->hv_root_ept;
+ if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_vmx->hv_root_ept = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_ept(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
}
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_vmx->hv_root_ept = INVALID_PAGE;
} else {
- ret = __hv_remote_flush_tlb_with_range(kvm,
- kvm_get_vcpu(kvm, 0), range);
+ ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
}
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ spin_unlock(&kvm_vmx->hv_root_ept_lock);
return ret;
}
static int hv_remote_flush_tlb(struct kvm *kvm)
* evmcs in singe VM shares same assist page.
*/
if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
if (!*p_hv_pa_pg)
return -ENOMEM;
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_vmx->hv_root_ept_lock);
+ to_vmx(vcpu)->hv_root_ept = root_ept;
+ if (root_ept != kvm_vmx->hv_root_ept)
+ kvm_vmx->hv_root_ept = INVALID_PAGE;
+ spin_unlock(&kvm_vmx->hv_root_ept_lock);
+ }
+#endif
+}
+
/*
* Comment's format: document - errata name - stepping - processor name.
* Refer from
static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
+ /*
+ * Emulation of instructions in SGX enclaves is impossible as RIP does
+ * not point tthe failing instruction, and even if it did, the code
+ * stream is inaccessible. Inject #UD instead of exiting to userspace
+ * so that guest userspace can't DoS the guest simply by triggering
+ * emulation (enclaves are CPL3 only).
+ */
+ if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
return true;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
+ union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
unsigned long rip, orig_rip;
+ u32 instr_len;
/*
* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ /*
+ * Emulating an enclave's instructions isn't supported as KVM
+ * cannot access the enclave's memory or its true RIP, e.g. the
+ * vmcs.GUEST_RIP points at the exit point of the enclave, not
+ * the RIP that actually triggered the VM-Exit. But, because
+ * most instructions that cause VM-Exit will #UD in an enclave,
+ * most instruction-based VM-Exits simply do not occur.
+ *
+ * There are a few exceptions, notably the debug instructions
+ * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+ * and generate #DB/#BP as expected, which KVM might intercept.
+ * But again, the CPU does the dirty work and saves an instr
+ * length of zero so VMMs don't shoot themselves in the foot.
+ * WARN if KVM tries to skip a non-zero length instruction on
+ * a VM-Exit from an enclave.
+ */
+ if (!instr_len)
+ goto rip_updated;
+
+ WARN(exit_reason.enclave_mode,
+ "KVM: skipping instruction after SGX enclave VM-Exit");
+
orig_rip = kvm_rip_read(vcpu);
- rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
/*
* We need to mask out the high 32 bits of RIP if not in 64-bit
return 0;
}
+rip_updated:
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+ [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+ break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
+
+ /* SGX may be enabled/disabled by guest's firmware */
+ vmx_write_encls_bitmap(vcpu, NULL);
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ /*
+ * On real hardware, the LE hash MSRs are writable before
+ * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+ * at which point SGX related bits in IA32_FEATURE_CONTROL
+ * become writable.
+ *
+ * KVM does not emulate SGX activation for simplicity, so
+ * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+ * is unlocked. This is technically not architectural
+ * behavior, but it's close enough.
+ */
+ if (!msr_info->host_initiated &&
+ (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+ !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+ return 1;
+ vmx->msr_ia32_sgxlepubkeyhash
+ [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!msr_info->host_initiated)
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
- int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= (root_hpa & PAGE_MASK);
+ eptp |= root_hpa;
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, pgd, pgd_level);
+ eptp = construct_eptp(vcpu, root_hpa, root_level);
vmcs_write64(EPT_POINTER, eptp);
- if (kvm_x86_ops.tlb_remote_flush) {
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- to_vmx(vcpu)->ept_pointer = eptp;
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_CHECK;
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- }
+ hv_track_root_ept(vcpu, root_hpa);
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
- guest_cr3 = pgd;
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
}
if (update_guest_cr3)
vmx->secondary_exec_control = exec_control;
}
-static void ept_set_mmio_spte_mask(void)
-{
- /*
- * EPT Misconfigurations can be generated if the value of bits 2:0
- * of an EPT paging-structure entry is 110b (write/execute).
- */
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
#define VMX_XSS_EXIT_BITMAP 0
/*
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
- if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmx_write_encls_bitmap(&vmx->vcpu, NULL);
if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
return 1;
}
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(vcpu);
-}
-
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
- int err;
-
- err = kvm_rdpmc(vcpu);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
- u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_rcx_read(vcpu);
-
- int err = kvm_set_xcr(vcpu, index, new_bv);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
{
gpa_t gpa;
+ if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+ return 1;
+
/*
* A nested guest cannot optimize MMIO vmexits, because we have an
* nGPA here instead of the required GPA.
}
}
-static void vmx_enable_tdp(void)
-{
- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
- enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK,
- cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- VMX_EPT_RWX_MASK, 0ull);
-
- ept_set_mmio_spte_mask();
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
u32 vmx_instruction_info;
return 1;
}
+#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
/*
- * SGX virtualization is not yet supported. There is no software
- * enable bit for SGX, so we have to trap ENCLS and inject a #UD
- * to prevent the guest from executing ENCLS.
+ * SGX virtualization is disabled. There is no software enable bit for
+ * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+ * the guest from executing ENCLS (when SGX is supported by hardware).
*/
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
+#endif /* CONFIG_X86_SGX_KVM */
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
[EXIT_REASON_HLT] = kvm_emulate_halt,
- [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_INVD] = kvm_emulate_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
- [EXIT_REASON_RDPMC] = handle_rdpmc,
- [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
+ [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_APIC_WRITE] = handle_apic_write,
[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
- [EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
+ [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_GDTR_IDTR] = handle_desc,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
[EXIT_REASON_INVEPT] = handle_vmx_instruction,
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
- [EXIT_REASON_RDRAND] = handle_invalid_op,
- [EXIT_REASON_RDSEED] = handle_invalid_op,
+ [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
+ [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+ unsigned int i;
+ struct vmx_msr_entry *e;
+
+ pr_err("MSR %s:\n", name);
+ for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+ pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
unsigned long cr4;
- u64 efer;
+ int efer_slot;
if (!dump_invalid_vmcs) {
pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- efer = vmcs_read64(GUEST_IA32_EFER);
secondary_exec_control = 0;
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
- {
+ if (cpu_has_vmx_ept()) {
pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
- if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
- (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- efer, vmcs_read64(GUEST_IA32_PAT));
+ efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+ else if (efer_slot >= 0)
+ pr_err("EFER= 0x%016llx (autoload)\n",
+ vmx->msr_autoload.guest.val[efer_slot].value);
+ else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer | (EFER_LMA | EFER_LME));
+ else
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
pr_err("InterruptStatus = %04x\n",
vmcs_read16(GUEST_INTR_STATUS));
+ if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+ if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+ vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
vmcs_readl(HOST_IA32_SYSENTER_ESP),
vmcs_read32(HOST_IA32_SYSENTER_CS),
vmcs_readl(HOST_IA32_SYSENTER_EIP));
- if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- vmcs_read64(HOST_IA32_EFER),
- vmcs_read64(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
if (cpu_has_load_perf_global_ctrl() &&
vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+ if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
pr_err("*** Control State ***\n");
pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
}
if (exit_reason.failed_vmentry) {
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason.full;
}
if (unlikely(vmx->fail)) {
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
unexpected_vmexit:
vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
exit_reason.full);
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
vmx->pi_desc.nv = POSTED_INTR_VECTOR;
vmx->pi_desc.sn = 1;
- vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+ vmx->hv_root_ept = INVALID_PAGE;
+#endif
return 0;
free_vmcs:
static int vmx_vm_init(struct kvm *kvm)
{
- spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
if (!ple_gap)
kvm->arch.pause_in_guest = true;
set_cr4_guest_host_mask(vmx);
+ vmx_write_encls_bitmap(vcpu, NULL);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_SGX_LC_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_SGX_LC_ENABLED;
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (!enable_sgx) {
+ kvm_cpu_cap_clear(X86_FEATURE_SGX);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ }
+
if (vmx_umip_emulated())
kvm_cpu_cap_set(X86_FEATURE_UMIP);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
- vmx_enable_tdp();
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+ cpu_has_vmx_ept_execute_only());
if (!enable_ept)
ept_lpage_level = 0;
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ setup_default_sgx_lepubkeyhash();
+
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
vmx_capability.ept);
*/
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
- u64 ept_pointer;
+ /* SGX Launch Control public key hash */
+ u64 msr_ia32_sgxlepubkeyhash[4];
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ u64 hv_root_ept;
+#endif
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
} shadow_msr_intercept;
};
-enum ept_pointers_status {
- EPT_POINTERS_CHECK = 0,
- EPT_POINTERS_MATCH = 1,
- EPT_POINTERS_MISMATCH = 2
-};
-
struct kvm_vmx {
struct kvm kvm;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
- enum ept_pointers_status ept_pointers_match;
- spinlock_t ept_pointer_lock;
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_ept;
+ spinlock_t hv_root_ept_lock;
+#endif
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
- int root_level);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
}
-void dump_vmcs(void);
+void dump_vmcs(struct kvm_vcpu *vcpu);
#endif /* __KVM_X86_VMX_H */
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
"32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "32-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "32-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"32-bit accessor invalid for natural width field");
}
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
VCPU_STAT("l1d_flush", l1d_flush),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VCPU_STAT("nested_run", nested_run),
+ VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
+ VCPU_STAT("directed_yield_successful", directed_yield_successful),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (has_error && !is_protmode(vcpu))
- has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
return 0;
}
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
- return __kvm_set_xcr(vcpu, index, xcr);
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
- int err;
- err = kvm_pmu_rdpmc(vcpu, ecx, &data);
- if (err)
- return err;
+ if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
kvm_rax_write(vcpu, (u32)data);
kvm_rdx_write(vcpu, data >> 32);
- return err;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
xfer_to_guest_mode_prepare();
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = 0;
+ break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
case KVM_CAP_X86_USER_SPACE_MSR:
case KVM_CAP_X86_MSR_FILTER:
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
case KVM_CAP_XEN_HVM:
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
kvm_update_pv_runtime(vcpu);
return 0;
-
default:
return -EINVAL;
}
kvm->arch.bus_lock_detection_enabled = true;
r = 0;
break;
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE: {
+ unsigned long allowed_attributes = 0;
+
+ r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+ if (r)
+ break;
+
+ /* KVM only supports the PROVISIONKEY privileged attribute. */
+ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+ !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+ kvm->arch.sgx_provisioning_allowed = true;
+ else
+ r = -EINVAL;
+ break;
+ }
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (kvm_x86_ops.vm_copy_enc_context_from)
+ r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+ return r;
default:
r = -EINVAL;
break;
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
if (r)
goto out_free_percpu;
- kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
}
EXPORT_SYMBOL_GPL(kvm_apicv_init);
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
{
struct kvm_vcpu *target = NULL;
struct kvm_apic_map *map;
+ vcpu->stat.directed_yield_attempted++;
+
rcu_read_lock();
- map = rcu_dereference(kvm->arch.apic_map);
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
target = map->phys_map[dest_id]->vcpu;
rcu_read_unlock();
- if (target && READ_ONCE(target->ready))
- kvm_vcpu_yield_to(target);
+ if (!target || !READ_ONCE(target->ready))
+ goto no_yield;
+
+ /* Ignore requests to yield to self */
+ if (vcpu == target)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(target) <= 0)
+ goto no_yield;
+
+ vcpu->stat.directed_yield_successful++;
+
+no_yield:
+ return;
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
break;
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
- kvm_sched_yield(vcpu->kvm, a1);
+ kvm_sched_yield(vcpu, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
break;
- kvm_sched_yield(vcpu->kvm, a0);
+ kvm_sched_yield(vcpu, a0);
ret = 0;
break;
default:
static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return -EIO;
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ return 1;
+ }
+
+ return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ static_call(kvm_x86_queue_exception)(vcpu);
+}
+
static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
/*
* from L2 to L1.
*/
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
goto busy;
}
}
}
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
goto out;
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
- vcpu->mmio_needed = 0;
- r = 0;
- goto out;
+ if (is_guest_mode(vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
- kvm_x86_ops.nested_ops->check_events(vcpu);
+ kvm_check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
return false;
}
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+ return true;
+
+ return false;
+}
+
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
return vcpu->arch.preempted_in_kernel;
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
return kvm_skip_emulated_instruction(vcpu);
default:
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
+({ \
+ bool failed = (consistency_check); \
+ if (failed) \
+ trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+ failed; \
+})
+
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
case SEV_CMD_DOWNLOAD_FIRMWARE: return sizeof(struct sev_data_download_firmware);
case SEV_CMD_GET_ID: return sizeof(struct sev_data_get_id);
case SEV_CMD_ATTESTATION_REPORT: return sizeof(struct sev_data_attestation_report);
+ case SEV_CMD_SEND_CANCEL: return sizeof(struct sev_data_send_cancel);
default: return 0;
}
struct sev_device *sev;
unsigned int phys_lsb, phys_msb;
unsigned int reg, ret = 0;
+ int buf_len;
if (!psp || !psp->sev_data)
return -ENODEV;
sev = psp->sev_data;
+ buf_len = sev_cmd_buffer_len(cmd);
+ if (WARN_ON_ONCE(!data != !buf_len))
+ return -EINVAL;
+
+ /*
+ * Copy the incoming data to driver's scratch buffer as __pa() will not
+ * work for some memory, e.g. vmalloc'd addresses, and @data may not be
+ * physically contiguous.
+ */
+ if (data)
+ memcpy(sev->cmd_buf, data, buf_len);
+
/* Get the physical address of the command buffer */
- phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
- phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
+ phys_lsb = data ? lower_32_bits(__psp_pa(sev->cmd_buf)) : 0;
+ phys_msb = data ? upper_32_bits(__psp_pa(sev->cmd_buf)) : 0;
dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n",
cmd, phys_msb, phys_lsb, psp_timeout);
print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data,
- sev_cmd_buffer_len(cmd), false);
+ buf_len, false);
iowrite32(phys_lsb, sev->io_regs + sev->vdata->cmdbuff_addr_lo_reg);
iowrite32(phys_msb, sev->io_regs + sev->vdata->cmdbuff_addr_hi_reg);
}
print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
- sev_cmd_buffer_len(cmd), false);
+ buf_len, false);
+
+ /*
+ * Copy potential output from the PSP back to data. Do this even on
+ * failure in case the caller wants to glean something from the error.
+ */
+ if (data)
+ memcpy(data, sev->cmd_buf, buf_len);
return ret;
}
static int __sev_platform_init_locked(int *error)
{
struct psp_device *psp = psp_master;
+ struct sev_data_init data;
struct sev_device *sev;
int rc = 0;
if (sev->state == SEV_STATE_INIT)
return 0;
+ memset(&data, 0, sizeof(data));
if (sev_es_tmr) {
u64 tmr_pa;
*/
tmr_pa = __pa(sev_es_tmr);
- sev->init_cmd_buf.flags |= SEV_INIT_FLAGS_SEV_ES;
- sev->init_cmd_buf.tmr_address = tmr_pa;
- sev->init_cmd_buf.tmr_len = SEV_ES_TMR_SIZE;
+ data.flags |= SEV_INIT_FLAGS_SEV_ES;
+ data.tmr_address = tmr_pa;
+ data.tmr_len = SEV_ES_TMR_SIZE;
}
- rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error);
+ rc = __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
if (rc)
return rc;
static int sev_get_platform_state(int *state, int *error)
{
- struct sev_device *sev = psp_master->sev_data;
+ struct sev_user_data_status data;
int rc;
- rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS,
- &sev->status_cmd_buf, error);
+ rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, error);
if (rc)
return rc;
- *state = sev->status_cmd_buf.state;
+ *state = data.state;
return rc;
}
static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
{
- struct sev_device *sev = psp_master->sev_data;
- struct sev_user_data_status *data = &sev->status_cmd_buf;
+ struct sev_user_data_status data;
int ret;
- ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error);
+ ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, &argp->error);
if (ret)
return ret;
- if (copy_to_user((void __user *)argp->data, data, sizeof(*data)))
+ if (copy_to_user((void __user *)argp->data, &data, sizeof(data)))
ret = -EFAULT;
return ret;
{
struct sev_device *sev = psp_master->sev_data;
struct sev_user_data_pek_csr input;
- struct sev_data_pek_csr *data;
+ struct sev_data_pek_csr data;
void __user *input_address;
void *blob = NULL;
int ret;
if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* userspace wants to query CSR length */
if (!input.address || !input.length)
/* allocate a physically contiguous buffer to store the CSR blob */
input_address = (void __user *)input.address;
- if (input.length > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EFAULT;
- goto e_free;
- }
+ if (input.length > SEV_FW_BLOB_MAX_SIZE)
+ return -EFAULT;
blob = kmalloc(input.length, GFP_KERNEL);
- if (!blob) {
- ret = -ENOMEM;
- goto e_free;
- }
+ if (!blob)
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = input.length;
+ data.address = __psp_pa(blob);
+ data.len = input.length;
cmd:
if (sev->state == SEV_STATE_UNINIT) {
goto e_free_blob;
}
- ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, data, &argp->error);
+ ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, &data, &argp->error);
/* If we query the CSR length, FW responded with expected data. */
- input.length = data->len;
+ input.length = data.len;
if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
static int sev_get_api_version(void)
{
struct sev_device *sev = psp_master->sev_data;
- struct sev_user_data_status *status;
+ struct sev_user_data_status status;
int error = 0, ret;
- status = &sev->status_cmd_buf;
- ret = sev_platform_status(status, &error);
+ ret = sev_platform_status(&status, &error);
if (ret) {
dev_err(sev->dev,
"SEV: failed to get status. Error: %#x\n", error);
return 1;
}
- sev->api_major = status->api_major;
- sev->api_minor = status->api_minor;
- sev->build = status->build;
- sev->state = status->state;
+ sev->api_major = status.api_major;
+ sev->api_minor = status.api_minor;
+ sev->build = status.build;
+ sev->state = status.state;
return 0;
}
{
struct sev_device *sev = psp_master->sev_data;
struct sev_user_data_pek_cert_import input;
- struct sev_data_pek_cert_import *data;
+ struct sev_data_pek_cert_import data;
void *pek_blob, *oca_blob;
int ret;
if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
/* copy PEK certificate blobs from userspace */
pek_blob = psp_copy_user_blob(input.pek_cert_address, input.pek_cert_len);
- if (IS_ERR(pek_blob)) {
- ret = PTR_ERR(pek_blob);
- goto e_free;
- }
+ if (IS_ERR(pek_blob))
+ return PTR_ERR(pek_blob);
- data->pek_cert_address = __psp_pa(pek_blob);
- data->pek_cert_len = input.pek_cert_len;
+ data.reserved = 0;
+ data.pek_cert_address = __psp_pa(pek_blob);
+ data.pek_cert_len = input.pek_cert_len;
/* copy PEK certificate blobs from userspace */
oca_blob = psp_copy_user_blob(input.oca_cert_address, input.oca_cert_len);
goto e_free_pek;
}
- data->oca_cert_address = __psp_pa(oca_blob);
- data->oca_cert_len = input.oca_cert_len;
+ data.oca_cert_address = __psp_pa(oca_blob);
+ data.oca_cert_len = input.oca_cert_len;
/* If platform is not in INIT state then transition it to INIT */
if (sev->state != SEV_STATE_INIT) {
goto e_free_oca;
}
- ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, data, &argp->error);
+ ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, &data, &argp->error);
e_free_oca:
kfree(oca_blob);
e_free_pek:
kfree(pek_blob);
-e_free:
- kfree(data);
return ret;
}
static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
{
struct sev_user_data_get_id2 input;
- struct sev_data_get_id *data;
+ struct sev_data_get_id data;
void __user *input_address;
void *id_blob = NULL;
int ret;
input_address = (void __user *)input.address;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
if (input.address && input.length) {
id_blob = kmalloc(input.length, GFP_KERNEL);
- if (!id_blob) {
- kfree(data);
+ if (!id_blob)
return -ENOMEM;
- }
- data->address = __psp_pa(id_blob);
- data->len = input.length;
+ data.address = __psp_pa(id_blob);
+ data.len = input.length;
+ } else {
+ data.address = 0;
+ data.len = 0;
}
- ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, data, &argp->error);
+ ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, &data, &argp->error);
/*
* Firmware will return the length of the ID value (either the minimum
* required length or the actual length written), return it to the user.
*/
- input.length = data->len;
+ input.length = data.len;
if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
ret = -EFAULT;
}
if (id_blob) {
- if (copy_to_user(input_address, id_blob, data->len)) {
+ if (copy_to_user(input_address, id_blob, data.len)) {
ret = -EFAULT;
goto e_free;
}
e_free:
kfree(id_blob);
- kfree(data);
return ret;
}
struct sev_device *sev = psp_master->sev_data;
struct sev_user_data_pdh_cert_export input;
void *pdh_blob = NULL, *cert_blob = NULL;
- struct sev_data_pdh_cert_export *data;
+ struct sev_data_pdh_cert_export data;
void __user *input_cert_chain_address;
void __user *input_pdh_cert_address;
int ret;
if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* Userspace wants to query the certificate length. */
if (!input.pdh_cert_address ||
input_cert_chain_address = (void __user *)input.cert_chain_address;
/* Allocate a physically contiguous buffer to store the PDH blob. */
- if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EFAULT;
- goto e_free;
- }
+ if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EFAULT;
/* Allocate a physically contiguous buffer to store the cert chain blob. */
- if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EFAULT;
- goto e_free;
- }
+ if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EFAULT;
pdh_blob = kmalloc(input.pdh_cert_len, GFP_KERNEL);
- if (!pdh_blob) {
- ret = -ENOMEM;
- goto e_free;
- }
+ if (!pdh_blob)
+ return -ENOMEM;
- data->pdh_cert_address = __psp_pa(pdh_blob);
- data->pdh_cert_len = input.pdh_cert_len;
+ data.pdh_cert_address = __psp_pa(pdh_blob);
+ data.pdh_cert_len = input.pdh_cert_len;
cert_blob = kmalloc(input.cert_chain_len, GFP_KERNEL);
if (!cert_blob) {
goto e_free_pdh;
}
- data->cert_chain_address = __psp_pa(cert_blob);
- data->cert_chain_len = input.cert_chain_len;
+ data.cert_chain_address = __psp_pa(cert_blob);
+ data.cert_chain_len = input.cert_chain_len;
cmd:
- ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error);
+ ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, &data, &argp->error);
/* If we query the length, FW responded with expected data. */
- input.cert_chain_len = data->cert_chain_len;
- input.pdh_cert_len = data->pdh_cert_len;
+ input.cert_chain_len = data.cert_chain_len;
+ input.pdh_cert_len = data.pdh_cert_len;
if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
ret = -EFAULT;
kfree(cert_blob);
e_free_pdh:
kfree(pdh_blob);
-e_free:
- kfree(data);
return ret;
}
if (!sev)
goto e_err;
+ sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 0);
+ if (!sev->cmd_buf)
+ goto e_sev;
+
psp->sev_data = sev;
sev->dev = dev;
if (!sev->vdata) {
ret = -ENODEV;
dev_err(dev, "sev: missing driver data\n");
- goto e_err;
+ goto e_buf;
}
psp_set_sev_irq_handler(psp, sev_irq_handler, sev);
e_irq:
psp_clear_sev_irq_handler(psp);
+e_buf:
+ devm_free_pages(dev, (unsigned long)sev->cmd_buf);
+e_sev:
+ devm_kfree(dev, sev);
e_err:
psp->sev_data = NULL;
unsigned int int_rcvd;
wait_queue_head_t int_queue;
struct sev_misc_dev *misc;
- struct sev_user_data_status status_cmd_buf;
- struct sev_data_init init_cmd_buf;
u8 api_major;
u8 api_minor;
u8 build;
+
+ void *cmd_buf;
};
int sev_dev_init(struct psp_device *psp);
int len, void *val);
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, struct kvm_io_device *dev);
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
- struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev);
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
gpa_t addr);
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+struct kvm_gfn_range {
+ struct kvm_memory_slot *slot;
+ gfn_t start;
+ gfn_t end;
+ pte_t pte;
+ bool may_block;
+};
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+#endif
+
enum {
OUTSIDE_GUEST_MODE,
IN_GUEST_MODE,
void kvm_get_kvm(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
+bool file_is_kvm(struct file *file);
void kvm_put_kvm_no_destroy(struct kvm *kvm);
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
+ const struct kvm_memory_slot *memslot);
#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_post_init_vm(struct kvm *kvm);
void kvm_arch_pre_destroy_vm(struct kvm *kvm);
}
static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}
SEV_CMD_SEND_UPDATE_DATA = 0x041,
SEV_CMD_SEND_UPDATE_VMSA = 0x042,
SEV_CMD_SEND_FINISH = 0x043,
+ SEV_CMD_SEND_CANCEL = 0x044,
/* Guest migration commands (incoming) */
SEV_CMD_RECEIVE_START = 0x050,
u64 pdh_cert_address; /* In */
u32 pdh_cert_len; /* In */
u32 reserved1;
- u64 plat_cert_address; /* In */
- u32 plat_cert_len; /* In */
+ u64 plat_certs_address; /* In */
+ u32 plat_certs_len; /* In */
u32 reserved2;
- u64 amd_cert_address; /* In */
- u32 amd_cert_len; /* In */
+ u64 amd_certs_address; /* In */
+ u32 amd_certs_len; /* In */
u32 reserved3;
u64 session_address; /* In */
u32 session_len; /* In/Out */
u32 handle; /* In */
} __packed;
+/**
+ * struct sev_data_send_cancel - SEND_CANCEL command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_cancel {
+ u32 handle; /* In */
+} __packed;
+
/**
* struct sev_data_receive_start - RECEIVE_START command parameters
*
TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
);
-TRACE_EVENT(kvm_age_page,
- TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref),
- TP_ARGS(gfn, level, slot, ref),
-
- TP_STRUCT__entry(
- __field( u64, hva )
- __field( u64, gfn )
- __field( u8, level )
- __field( u8, referenced )
- ),
-
- TP_fast_assign(
- __entry->gfn = gfn;
- __entry->level = level;
- __entry->hva = ((gfn - slot->base_gfn) <<
- PAGE_SHIFT) + slot->userspace_addr;
- __entry->referenced = ref;
- ),
-
- TP_printk("hva %llx gfn %llx level %u %s",
- __entry->hva, __entry->gfn, __entry->level,
- __entry->referenced ? "YOUNG" : "OLD")
-);
-
#ifdef CONFIG_KVM_ASYNC_PF
DECLARE_EVENT_CLASS(kvm_async_get_page_class,
TP_printk("vcpu %d", __entry->vcpu_id)
);
+TRACE_EVENT(kvm_unmap_hva_range,
+ TP_PROTO(unsigned long start, unsigned long end),
+ TP_ARGS(start, end),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, start )
+ __field( unsigned long, end )
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ ),
+
+ TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
+ __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+ TP_PROTO(unsigned long hva),
+ TP_ARGS(hva),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, hva )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ ),
+
+ TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+ TP_PROTO(unsigned long start, unsigned long end),
+ TP_ARGS(start, end),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, start )
+ __field( unsigned long, end )
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ ),
+
+ TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
+ __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+ TP_PROTO(unsigned long hva),
+ TP_ARGS(hva),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, hva )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ ),
+
+ TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
+);
+
#endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */
#define KVM_CAP_DIRTY_LOG_RING 192
#define KVM_CAP_X86_BUS_LOCK_EXIT 193
#define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_SET_GUEST_DEBUG2 195
+#define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
#ifdef KVM_CAP_IRQ_ROUTING
KVM_SEV_CERT_EXPORT,
/* Attestation report */
KVM_SEV_GET_ATTESTATION_REPORT,
+ /* Guest Migration Extension */
+ KVM_SEV_SEND_CANCEL,
KVM_SEV_NR_MAX,
};
__u32 len;
};
+struct kvm_sev_send_start {
+ __u32 policy;
+ __u64 pdh_cert_uaddr;
+ __u32 pdh_cert_len;
+ __u64 plat_certs_uaddr;
+ __u32 plat_certs_len;
+ __u64 amd_certs_uaddr;
+ __u32 amd_certs_len;
+ __u64 session_uaddr;
+ __u32 session_len;
+};
+
+struct kvm_sev_send_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+};
+
+struct kvm_sev_receive_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 pdh_uaddr;
+ __u32 pdh_len;
+ __u64 session_uaddr;
+ __u32 session_len;
+};
+
+struct kvm_sev_receive_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+};
+
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
#define HUGETLB_FLAG_ENCODE_SHIFT 26
#define HUGETLB_FLAG_ENCODE_MASK 0x3f
+#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT)
/dirty_log_perf_test
/hardware_disable_test
/kvm_create_max_vcpus
+/kvm_page_table_test
/memslot_modification_stress_test
/set_memory_region_test
/steal_time
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
TEST_GEN_PROGS_x86_64 += hardware_disable_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
TEST_GEN_PROGS_aarch64 += dirty_log_test
TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
TEST_GEN_PROGS_aarch64 += set_memory_region_test
TEST_GEN_PROGS_aarch64 += steal_time
TEST_GEN_PROGS_s390x += demand_paging_test
TEST_GEN_PROGS_s390x += dirty_log_test
TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += kvm_page_table_test
TEST_GEN_PROGS_s390x += set_memory_region_test
TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <asm/barrier.h>
+#include <linux/atomic.h>
#include "kvm_util.h"
#include "test_util.h"
static uint64_t host_track_next_count;
/* Whether dirty ring reset is requested, or finished */
-static sem_t dirty_ring_vcpu_stop;
-static sem_t dirty_ring_vcpu_cont;
+static sem_t sem_vcpu_stop;
+static sem_t sem_vcpu_cont;
+/*
+ * This is only set by main thread, and only cleared by vcpu thread. It is
+ * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC
+ * is the only place that we'll guarantee both "dirty bit" and "dirty data"
+ * will match. E.g., SIG_IPI won't guarantee that if the vcpu is interrupted
+ * after setting dirty bit but before the data is written.
+ */
+static atomic_t vcpu_sync_stop_requested;
/*
* This is updated by the vcpu thread to tell the host whether it's a
* ring-full event. It should only be read until a sem_wait() of
- * dirty_ring_vcpu_stop and before vcpu continues to run.
+ * sem_vcpu_stop and before vcpu continues to run.
*/
static bool dirty_ring_vcpu_ring_full;
/*
kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
}
+/* Should only be called after a GUEST_SYNC */
+static void vcpu_handle_sync_stop(void)
+{
+ if (atomic_read(&vcpu_sync_stop_requested)) {
+ /* It means main thread is sleeping waiting */
+ atomic_set(&vcpu_sync_stop_requested, false);
+ sem_post(&sem_vcpu_stop);
+ sem_wait_until(&sem_vcpu_cont);
+ }
+}
+
static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
"Invalid guest sync status: exit_reason=%s\n",
exit_reason_str(run->exit_reason));
+
+ vcpu_handle_sync_stop();
}
static bool dirty_ring_supported(void)
{
/* This makes sure that hardware PML cache flushed */
vcpu_kick();
- sem_wait_until(&dirty_ring_vcpu_stop);
+ sem_wait_until(&sem_vcpu_stop);
}
static void dirty_ring_continue_vcpu(void)
{
pr_info("Notifying vcpu to continue\n");
- sem_post(&dirty_ring_vcpu_cont);
+ sem_post(&sem_vcpu_cont);
}
static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
/* Update the flag first before pause */
WRITE_ONCE(dirty_ring_vcpu_ring_full,
run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
- sem_post(&dirty_ring_vcpu_stop);
+ sem_post(&sem_vcpu_stop);
pr_info("vcpu stops because %s...\n",
dirty_ring_vcpu_ring_full ?
"dirty ring is full" : "vcpu is kicked out");
- sem_wait_until(&dirty_ring_vcpu_cont);
+ sem_wait_until(&sem_vcpu_cont);
pr_info("vcpu continues now.\n");
} else {
TEST_ASSERT(false, "Invalid guest sync status: "
static void dirty_ring_before_vcpu_join(void)
{
/* Kick another round of vcpu just to make sure it will quit */
- sem_post(&dirty_ring_vcpu_cont);
+ sem_post(&sem_vcpu_cont);
}
struct log_mode {
*/
sigmask->len = 8;
pthread_sigmask(0, NULL, sigset);
+ sigdelset(sigset, SIG_IPI);
vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
- sigaddset(sigset, SIG_IPI);
- pthread_sigmask(SIG_BLOCK, sigset, NULL);
sigemptyset(sigset);
sigaddset(sigset, SIG_IPI);
usleep(p->interval * 1000);
log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
bmap, host_num_pages);
+
+ /*
+ * See vcpu_sync_stop_requested definition for details on why
+ * we need to stop vcpu when verify data.
+ */
+ atomic_set(&vcpu_sync_stop_requested, true);
+ sem_wait_until(&sem_vcpu_stop);
+ /*
+ * NOTE: for dirty ring, it's possible that we didn't stop at
+ * GUEST_SYNC but instead we stopped because ring is full;
+ * that's okay too because ring full means we're only missing
+ * the flush of the last page, and since we handle the last
+ * page specially verification will succeed anyway.
+ */
+ assert(host_log_mode == LOG_MODE_DIRTY_RING ||
+ atomic_read(&vcpu_sync_stop_requested) == false);
vm_dirty_log_verify(mode, bmap);
+ sem_post(&sem_vcpu_cont);
+
iteration++;
sync_global_to_guest(vm, iteration);
}
.interval = TEST_HOST_LOOP_INTERVAL,
};
int opt, i;
+ sigset_t sigset;
- sem_init(&dirty_ring_vcpu_stop, 0, 0);
- sem_init(&dirty_ring_vcpu_cont, 0, 0);
+ sem_init(&sem_vcpu_stop, 0, 0);
+ sem_init(&sem_vcpu_cont, 0, 0);
guest_modes_append_default();
srandom(time(0));
+ /* Ensure that vCPU threads start with SIG_IPI blocked. */
+ sigemptyset(&sigset);
+ sigaddset(&sigset, SIG_IPI);
+ pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
if (host_log_mode_option == LOG_MODE_ALL) {
/* Run each log mode */
for (i = 0; i < LOG_MODE_NUM; i++) {
#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT)
#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE)
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
struct vm_guest_mode_params {
unsigned int pa_bits;
unsigned int va_bits;
int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
void kvm_vm_free(struct kvm_vm *vmp);
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
VM_MEM_SRC_ANONYMOUS_HUGETLB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+ NUM_SRC_TYPES,
};
struct vm_mem_backing_src_alias {
const char *name;
- enum vm_mem_backing_src_type type;
+ uint32_t flag;
};
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
void backing_src_help(void);
enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+/* Different guest memory accessing stages */
+enum test_stage {
+ KVM_BEFORE_MAPPINGS,
+ KVM_CREATE_MAPPINGS,
+ KVM_UPDATE_MAPPINGS,
+ KVM_ADJUST_MAPPINGS,
+ NUM_TEST_STAGES,
+};
+
+static const char * const test_stage_string[] = {
+ "KVM_BEFORE_MAPPINGS",
+ "KVM_CREATE_MAPPINGS",
+ "KVM_UPDATE_MAPPINGS",
+ "KVM_ADJUST_MAPPINGS",
+};
+
+struct vcpu_args {
+ int vcpu_id;
+ bool vcpu_write;
+};
+
+struct test_args {
+ struct kvm_vm *vm;
+ uint64_t guest_test_virt_mem;
+ uint64_t host_page_size;
+ uint64_t host_num_pages;
+ uint64_t large_page_size;
+ uint64_t large_num_pages;
+ uint64_t host_pages_per_lpage;
+ enum vm_mem_backing_src_type src_type;
+ struct vcpu_args vcpu_args[KVM_MAX_VCPUS];
+};
+
+/*
+ * Guest variables. Use addr_gva2hva() if these variables need
+ * to be changed in host.
+ */
+static enum test_stage guest_test_stage;
+
+/* Host variables */
+static uint32_t nr_vcpus = 1;
+static struct test_args test_args;
+static enum test_stage *current_stage;
+static bool host_quit;
+
+/* Whether the test stage is updated, or completed */
+static sem_t test_stage_updated;
+static sem_t test_stage_completed;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+static void guest_code(int vcpu_id)
+{
+ struct test_args *p = &test_args;
+ struct vcpu_args *vcpu_args = &p->vcpu_args[vcpu_id];
+ enum test_stage *current_stage = &guest_test_stage;
+ uint64_t addr;
+ int i, j;
+
+ /* Make sure vCPU args data structure is not corrupt */
+ GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
+
+ while (true) {
+ addr = p->guest_test_virt_mem;
+
+ switch (READ_ONCE(*current_stage)) {
+ /*
+ * All vCPU threads will be started in this stage,
+ * where guest code of each vCPU will do nothing.
+ */
+ case KVM_BEFORE_MAPPINGS:
+ break;
+
+ /*
+ * Before dirty logging, vCPUs concurrently access the first
+ * 8 bytes of each page (host page/large page) within the same
+ * memory region with different accessing types (read/write).
+ * Then KVM will create normal page mappings or huge block
+ * mappings for them.
+ */
+ case KVM_CREATE_MAPPINGS:
+ for (i = 0; i < p->large_num_pages; i++) {
+ if (vcpu_args->vcpu_write)
+ *(uint64_t *)addr = 0x0123456789ABCDEF;
+ else
+ READ_ONCE(*(uint64_t *)addr);
+
+ addr += p->large_page_size;
+ }
+ break;
+
+ /*
+ * During dirty logging, KVM will only update attributes of the
+ * normal page mappings from RO to RW if memory backing src type
+ * is anonymous. In other cases, KVM will split the huge block
+ * mappings into normal page mappings if memory backing src type
+ * is THP or HUGETLB.
+ */
+ case KVM_UPDATE_MAPPINGS:
+ if (p->src_type == VM_MEM_SRC_ANONYMOUS) {
+ for (i = 0; i < p->host_num_pages; i++) {
+ *(uint64_t *)addr = 0x0123456789ABCDEF;
+ addr += p->host_page_size;
+ }
+ break;
+ }
+
+ for (i = 0; i < p->large_num_pages; i++) {
+ /*
+ * Write to the first host page in each large
+ * page region, and triger break of large pages.
+ */
+ *(uint64_t *)addr = 0x0123456789ABCDEF;
+
+ /*
+ * Access the middle host pages in each large
+ * page region. Since dirty logging is enabled,
+ * this will create new mappings at the smallest
+ * granularity.
+ */
+ addr += p->large_page_size / 2;
+ for (j = 0; j < p->host_pages_per_lpage / 2; j++) {
+ READ_ONCE(*(uint64_t *)addr);
+ addr += p->host_page_size;
+ }
+ }
+ break;
+
+ /*
+ * After dirty logging is stopped, vCPUs concurrently read
+ * from every single host page. Then KVM will coalesce the
+ * split page mappings back to block mappings. And a TLB
+ * conflict abort could occur here if TLB entries of the
+ * page mappings are not fully invalidated.
+ */
+ case KVM_ADJUST_MAPPINGS:
+ for (i = 0; i < p->host_num_pages; i++) {
+ READ_ONCE(*(uint64_t *)addr);
+ addr += p->host_page_size;
+ }
+ break;
+
+ default:
+ GUEST_ASSERT(0);
+ }
+
+ GUEST_SYNC(1);
+ }
+}
+
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct vcpu_args *vcpu_args = data;
+ struct kvm_vm *vm = test_args.vm;
+ int vcpu_id = vcpu_args->vcpu_id;
+ struct kvm_run *run;
+ struct timespec start;
+ struct timespec ts_diff;
+ enum test_stage stage;
+
+ vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+ run = vcpu_state(vm, vcpu_id);
+
+ while (!READ_ONCE(host_quit)) {
+ ret = sem_wait(&test_stage_updated);
+ TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+ if (READ_ONCE(host_quit))
+ return NULL;
+
+ clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+ ret = _vcpu_run(vm, vcpu_id);
+ ts_diff = timespec_elapsed(start);
+
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+ TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
+ "Invalid guest sync status: exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+
+ pr_debug("Got sync event from vCPU %d\n", vcpu_id);
+ stage = READ_ONCE(*current_stage);
+
+ /*
+ * Here we can know the execution time of every
+ * single vcpu running in different test stages.
+ */
+ pr_debug("vCPU %d has completed stage %s\n"
+ "execution time is: %ld.%.9lds\n\n",
+ vcpu_id, test_stage_string[stage],
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ ret = sem_post(&test_stage_completed);
+ TEST_ASSERT(ret == 0, "Error in sem_post");
+ }
+
+ return NULL;
+}
+
+struct test_params {
+ uint64_t phys_offset;
+ uint64_t test_mem_size;
+ enum vm_mem_backing_src_type src_type;
+};
+
+static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
+{
+ int ret;
+ struct test_params *p = arg;
+ struct vcpu_args *vcpu_args;
+ enum vm_mem_backing_src_type src_type = p->src_type;
+ uint64_t large_page_size = get_backing_src_pagesz(src_type);
+ uint64_t guest_page_size = vm_guest_mode_params[mode].page_size;
+ uint64_t host_page_size = getpagesize();
+ uint64_t test_mem_size = p->test_mem_size;
+ uint64_t guest_num_pages;
+ uint64_t alignment;
+ void *host_test_mem;
+ struct kvm_vm *vm;
+ int vcpu_id;
+
+ /* Align up the test memory size */
+ alignment = max(large_page_size, guest_page_size);
+ test_mem_size = (test_mem_size + alignment - 1) & ~(alignment - 1);
+
+ /* Create a VM with enough guest pages */
+ guest_num_pages = test_mem_size / guest_page_size;
+ vm = vm_create_with_vcpus(mode, nr_vcpus,
+ guest_num_pages, 0, guest_code, NULL);
+
+ /* Align down GPA of the testing memslot */
+ if (!p->phys_offset)
+ guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+ guest_page_size;
+ else
+ guest_test_phys_mem = p->phys_offset;
+#ifdef __s390x__
+ alignment = max(0x100000, alignment);
+#endif
+ guest_test_phys_mem &= ~(alignment - 1);
+
+ /* Set up the shared data structure test_args */
+ test_args.vm = vm;
+ test_args.guest_test_virt_mem = guest_test_virt_mem;
+ test_args.host_page_size = host_page_size;
+ test_args.host_num_pages = test_mem_size / host_page_size;
+ test_args.large_page_size = large_page_size;
+ test_args.large_num_pages = test_mem_size / large_page_size;
+ test_args.host_pages_per_lpage = large_page_size / host_page_size;
+ test_args.src_type = src_type;
+
+ for (vcpu_id = 0; vcpu_id < KVM_MAX_VCPUS; vcpu_id++) {
+ vcpu_args = &test_args.vcpu_args[vcpu_id];
+ vcpu_args->vcpu_id = vcpu_id;
+ vcpu_args->vcpu_write = !(vcpu_id % 2);
+ }
+
+ /* Add an extra memory slot with specified backing src type */
+ vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem,
+ TEST_MEM_SLOT_INDEX, guest_num_pages, 0);
+
+ /* Do mapping(GVA->GPA) for the testing memory slot */
+ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+ /* Cache the HVA pointer of the region */
+ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
+
+ /* Export shared structure test_args to guest */
+ ucall_init(vm, NULL);
+ sync_global_to_guest(vm, test_args);
+
+ ret = sem_init(&test_stage_updated, 0, 0);
+ TEST_ASSERT(ret == 0, "Error in sem_init");
+
+ ret = sem_init(&test_stage_completed, 0, 0);
+ TEST_ASSERT(ret == 0, "Error in sem_init");
+
+ current_stage = addr_gva2hva(vm, (vm_vaddr_t)(&guest_test_stage));
+ *current_stage = NUM_TEST_STAGES;
+
+ pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+ pr_info("Testing memory backing src type: %s\n",
+ vm_mem_backing_src_alias(src_type)->name);
+ pr_info("Testing memory backing src granularity: 0x%lx\n",
+ large_page_size);
+ pr_info("Testing memory size(aligned): 0x%lx\n", test_mem_size);
+ pr_info("Guest physical test memory offset: 0x%lx\n",
+ guest_test_phys_mem);
+ pr_info("Host virtual test memory offset: 0x%lx\n",
+ (uint64_t)host_test_mem);
+ pr_info("Number of testing vCPUs: %d\n", nr_vcpus);
+
+ return vm;
+}
+
+static void vcpus_complete_new_stage(enum test_stage stage)
+{
+ int ret;
+ int vcpus;
+
+ /* Wake up all the vcpus to run new test stage */
+ for (vcpus = 0; vcpus < nr_vcpus; vcpus++) {
+ ret = sem_post(&test_stage_updated);
+ TEST_ASSERT(ret == 0, "Error in sem_post");
+ }
+ pr_debug("All vcpus have been notified to continue\n");
+
+ /* Wait for all the vcpus to complete new test stage */
+ for (vcpus = 0; vcpus < nr_vcpus; vcpus++) {
+ ret = sem_wait(&test_stage_completed);
+ TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+ pr_debug("%d vcpus have completed stage %s\n",
+ vcpus + 1, test_stage_string[stage]);
+ }
+
+ pr_debug("All vcpus have completed stage %s\n",
+ test_stage_string[stage]);
+}
+
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+ int ret;
+ pthread_t *vcpu_threads;
+ struct kvm_vm *vm;
+ int vcpu_id;
+ struct timespec start;
+ struct timespec ts_diff;
+
+ /* Create VM with vCPUs and make some pre-initialization */
+ vm = pre_init_before_test(mode, arg);
+
+ vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+ TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+ host_quit = false;
+ *current_stage = KVM_BEFORE_MAPPINGS;
+
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+ &test_args.vcpu_args[vcpu_id]);
+ }
+
+ vcpus_complete_new_stage(*current_stage);
+ pr_info("Started all vCPUs successfully\n");
+
+ /* Test the stage of KVM creating mappings */
+ *current_stage = KVM_CREATE_MAPPINGS;
+
+ clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+ vcpus_complete_new_stage(*current_stage);
+ ts_diff = timespec_elapsed(start);
+
+ pr_info("KVM_CREATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ /* Test the stage of KVM updating mappings */
+ vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
+ KVM_MEM_LOG_DIRTY_PAGES);
+
+ *current_stage = KVM_UPDATE_MAPPINGS;
+
+ clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+ vcpus_complete_new_stage(*current_stage);
+ ts_diff = timespec_elapsed(start);
+
+ pr_info("KVM_UPDATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ /* Test the stage of KVM adjusting mappings */
+ vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
+
+ *current_stage = KVM_ADJUST_MAPPINGS;
+
+ clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+ vcpus_complete_new_stage(*current_stage);
+ ts_diff = timespec_elapsed(start);
+
+ pr_info("KVM_ADJUST_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ /* Tell the vcpu thread to quit */
+ host_quit = true;
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ ret = sem_post(&test_stage_updated);
+ TEST_ASSERT(ret == 0, "Error in sem_post");
+ }
+
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+ pthread_join(vcpu_threads[vcpu_id], NULL);
+
+ ret = sem_destroy(&test_stage_updated);
+ TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+ ret = sem_destroy(&test_stage_completed);
+ TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+ free(vcpu_threads);
+ ucall_uninit(vm);
+ kvm_vm_free(vm);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-p offset] [-m mode] "
+ "[-b mem-size] [-v vcpus] [-s mem-type]\n", name);
+ puts("");
+ printf(" -p: specify guest physical test memory offset\n"
+ " Warning: a low offset can conflict with the loaded test code.\n");
+ guest_modes_help();
+ printf(" -b: specify size of the memory region for testing. e.g. 10M or 3G.\n"
+ " (default: 1G)\n");
+ printf(" -v: specify the number of vCPUs to run\n"
+ " (default: 1)\n");
+ printf(" -s: specify the type of memory that should be used to\n"
+ " back the guest data region.\n"
+ " (default: anonymous)\n\n");
+ backing_src_help();
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ struct test_params p = {
+ .test_mem_size = DEFAULT_TEST_MEM_SIZE,
+ .src_type = VM_MEM_SRC_ANONYMOUS,
+ };
+ int opt;
+
+ guest_modes_append_default();
+
+ while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) {
+ switch (opt) {
+ case 'p':
+ p.phys_offset = strtoull(optarg, NULL, 0);
+ break;
+ case 'm':
+ guest_modes_cmdline(optarg);
+ break;
+ case 'b':
+ p.test_mem_size = parse_size(optarg);
+ break;
+ case 'v':
+ nr_vcpus = atoi(optarg);
+ TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+ "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+ break;
+ case 's':
+ p.src_type = parse_backing_src_type(optarg);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ exit(0);
+ }
+ }
+
+ for_each_guest_mode(run_test, &p);
+
+ return 0;
+}
fprintf(stderr, "==== Test Assertion Failure ====\n"
" %s:%u: %s\n"
- " pid=%d tid=%d - %s\n",
+ " pid=%d tid=%d errno=%d - %s\n",
file, line, exp_str, getpid(), _gettid(),
- strerror(errno));
+ errno, strerror(errno));
test_dump_stack();
if (fmt) {
fputs(" ", stderr);
#include <unistd.h>
#include <linux/kernel.h>
-#define KVM_UTIL_PGS_PER_HUGEPG 512
#define KVM_UTIL_MIN_PFN 2
static int vcpu_mmap_sz(void);
"rc: %i errno: %i", vm->fd, errno);
}
-const char * const vm_guest_mode_string[] = {
- "PA-bits:52, VA-bits:48, 4K pages",
- "PA-bits:52, VA-bits:48, 64K pages",
- "PA-bits:48, VA-bits:48, 4K pages",
- "PA-bits:48, VA-bits:48, 64K pages",
- "PA-bits:40, VA-bits:48, 4K pages",
- "PA-bits:40, VA-bits:48, 64K pages",
- "PA-bits:ANY, VA-bits:48, 4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
- "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+ static const char * const strings[] = {
+ [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages",
+ [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages",
+ [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages",
+ [VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages",
+ [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages",
+ [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages",
+ [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages",
+ };
+ _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+ "Missing new mode strings?");
+
+ TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+ return strings[i];
+}
const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 52, 48, 0x1000, 12 },
{
int ret;
struct userspace_mem_region *region;
- size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+ size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t alignment;
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
#endif
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
- alignment = max(huge_page_size, alignment);
+ alignment = max(backing_src_pagesz, alignment);
/* Add enough memory to align up if necessary */
if (alignment > 1)
region->mmap_start = mmap(NULL, region->mmap_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS
- | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+ | vm_mem_backing_src_alias(src_type)->flag,
-1, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
region->host_mem = align(region->mmap_start, alignment);
/* As needed perform madvise */
- if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
- struct stat statbuf;
-
- ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
- TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
- "stat /sys/kernel/mm/transparent_hugepage");
-
- TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
- "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
-
- if (ret == 0) {
- ret = madvise(region->host_mem, npages * vm->page_size,
- src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
- TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
- region->host_mem, npages * vm->page_size, src_type);
- }
+ if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+ src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+ ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+ TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
+ region->host_mem, npages * vm->page_size,
+ vm_mem_backing_src_alias(src_type)->name);
}
region->unused_phy_pages = sparsebit_alloc();
#include <limits.h>
#include <stdlib.h>
#include <time.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
#include "linux/kernel.h"
#include "test_util.h"
puts(", skipping test");
}
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
- {"anonymous", VM_MEM_SRC_ANONYMOUS,},
- {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
- {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
+bool thp_configured(void)
+{
+ int ret;
+ struct stat statbuf;
+
+ ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+ TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+ "Error in stating /sys/kernel/mm/transparent_hugepage");
+
+ return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+ size_t size;
+ FILE *f;
+
+ TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+ f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+ TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");
+
+ fscanf(f, "%ld", &size);
+ fclose(f);
+
+ return size;
+}
+
+size_t get_def_hugetlb_pagesz(void)
+{
+ char buf[64];
+ const char *tag = "Hugepagesize:";
+ FILE *f;
+
+ f = fopen("/proc/meminfo", "r");
+ TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo");
+
+ while (fgets(buf, sizeof(buf), f) != NULL) {
+ if (strstr(buf, tag) == buf) {
+ fclose(f);
+ return strtoull(buf + strlen(tag), NULL, 10) << 10;
+ }
+ }
+
+ if (feof(f))
+ TEST_FAIL("HUGETLB is not configured in host kernel");
+ else
+ TEST_FAIL("Error in reading /proc/meminfo");
+
+ fclose(f);
+ return 0;
+}
+
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+ static const struct vm_mem_backing_src_alias aliases[] = {
+ [VM_MEM_SRC_ANONYMOUS] = {
+ .name = "anonymous",
+ .flag = 0,
+ },
+ [VM_MEM_SRC_ANONYMOUS_THP] = {
+ .name = "anonymous_thp",
+ .flag = 0,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
+ .name = "anonymous_hugetlb",
+ .flag = MAP_HUGETLB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
+ .name = "anonymous_hugetlb_16kb",
+ .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
+ .name = "anonymous_hugetlb_64kb",
+ .flag = MAP_HUGETLB | MAP_HUGE_64KB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
+ .name = "anonymous_hugetlb_512kb",
+ .flag = MAP_HUGETLB | MAP_HUGE_512KB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
+ .name = "anonymous_hugetlb_1mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_1MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
+ .name = "anonymous_hugetlb_2mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_2MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
+ .name = "anonymous_hugetlb_8mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_8MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
+ .name = "anonymous_hugetlb_16mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_16MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
+ .name = "anonymous_hugetlb_32mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_32MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
+ .name = "anonymous_hugetlb_256mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_256MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
+ .name = "anonymous_hugetlb_512mb",
+ .flag = MAP_HUGETLB | MAP_HUGE_512MB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
+ .name = "anonymous_hugetlb_1gb",
+ .flag = MAP_HUGETLB | MAP_HUGE_1GB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
+ .name = "anonymous_hugetlb_2gb",
+ .flag = MAP_HUGETLB | MAP_HUGE_2GB,
+ },
+ [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
+ .name = "anonymous_hugetlb_16gb",
+ .flag = MAP_HUGETLB | MAP_HUGE_16GB,
+ },
+ };
+ _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
+ "Missing new backing src types?");
+
+ TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %d too big", i);
+
+ return &aliases[i];
+}
+
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
+size_t get_backing_src_pagesz(uint32_t i)
+{
+ uint32_t flag = vm_mem_backing_src_alias(i)->flag;
+
+ switch (i) {
+ case VM_MEM_SRC_ANONYMOUS:
+ return getpagesize();
+ case VM_MEM_SRC_ANONYMOUS_THP:
+ return get_trans_hugepagesz();
+ case VM_MEM_SRC_ANONYMOUS_HUGETLB:
+ return get_def_hugetlb_pagesz();
+ default:
+ return MAP_HUGE_PAGE_SIZE(flag);
+ }
+}
void backing_src_help(void)
{
int i;
printf("Available backing src types:\n");
- for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
- printf("\t%s\n", backing_src_aliases[i].name);
+ for (i = 0; i < NUM_SRC_TYPES; i++)
+ printf("\t%s\n", vm_mem_backing_src_alias(i)->name);
}
enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
{
int i;
- for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
- if (!strcmp(type_name, backing_src_aliases[i].name))
- return backing_src_aliases[i].type;
+ for (i = 0; i < NUM_SRC_TYPES; i++)
+ if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name))
+ return i;
backing_src_help();
TEST_FAIL("Unknown backing src type: %s", type_name);
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
}
- struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);;
+ struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
rs->state = 0x5a;
for (;;) {
#define __aligned(x) __attribute__((__aligned__(x)))
#define __packed __attribute__((packed))
-#include "../../../../arch/x86/kernel/cpu/sgx/arch.h"
+#include "../../../../arch/x86/include/asm/sgx.h"
#include "../../../../arch/x86/include/asm/enclu.h"
#include "../../../../arch/x86/include/uapi/asm/sgx.h"
fd = open(path, O_RDONLY);
if (fd == -1) {
- perror("open()");
+ perror("enclave executable open()");
return false;
}
ret = stat(path, &sb);
if (ret) {
- perror("stat()");
+ perror("enclave executable stat()");
goto err;
}
bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (bin == MAP_FAILED) {
- perror("mmap()");
+ perror("enclave executable mmap()");
goto err;
}
ioc.src = (unsigned long)secs;
rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc);
if (rc) {
- fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n",
- errno);
+ perror("SGX_IOC_ENCLAVE_CREATE failed");
munmap((void *)secs->base, encl->encl_size);
return false;
}
rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc);
if (rc < 0) {
- fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n",
- errno);
+ perror("SGX_IOC_ENCLAVE_ADD_PAGES failed");
return false;
}
return true;
}
+
+
bool encl_load(const char *path, struct encl *encl)
{
+ const char device_path[] = "/dev/sgx_enclave";
Elf64_Phdr *phdr_tbl;
off_t src_offset;
Elf64_Ehdr *ehdr;
+ struct stat sb;
+ void *ptr;
int i, j;
int ret;
+ int fd = -1;
memset(encl, 0, sizeof(*encl));
- ret = open("/dev/sgx_enclave", O_RDWR);
- if (ret < 0) {
- fprintf(stderr, "Unable to open /dev/sgx_enclave\n");
+ fd = open(device_path, O_RDWR);
+ if (fd < 0) {
+ perror("Unable to open /dev/sgx_enclave");
+ goto err;
+ }
+
+ ret = stat(device_path, &sb);
+ if (ret) {
+ perror("device file stat()");
+ goto err;
+ }
+
+ /*
+ * This just checks if the /dev file has these permission
+ * bits set. It does not check that the current user is
+ * the owner or in the owning group.
+ */
+ if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+ fprintf(stderr, "no execute permissions on device file %s\n", device_path);
+ goto err;
+ }
+
+ ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0);
+ if (ptr == (void *)-1) {
+ perror("mmap for read");
+ goto err;
+ }
+ munmap(ptr, PAGE_SIZE);
+
+#define ERR_MSG \
+"mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \
+" Check that current user has execute permissions on %s and \n" \
+" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \
+" If so, remount it executable: mount -o remount,exec /dev\n\n"
+
+ ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0);
+ if (ptr == (void *)-1) {
+ fprintf(stderr, ERR_MSG, device_path);
goto err;
}
+ munmap(ptr, PAGE_SIZE);
- encl->fd = ret;
+ encl->fd = fd;
if (!encl_map_bin(path, encl))
goto err;
return true;
err:
+ if (fd != -1)
+ close(fd);
encl_delete(encl);
return false;
}
area = mmap(NULL, encl_size * 2, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (area == MAP_FAILED) {
- perror("mmap");
+ perror("reservation mmap()");
return false;
}
ioc.sigstruct = (uint64_t)&encl->sigstruct;
ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc);
if (ret) {
- fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n",
- errno);
+ perror("SGX_IOC_ENCLAVE_INIT failed");
return false;
}
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
+#include <sys/auxv.h>
#include "defines.h"
#include "main.h"
#include "../kselftest.h"
Elf64_Word *elf_hashtab;
};
-static void *vdso_get_base_addr(char *envp[])
-{
- Elf64_auxv_t *auxv;
- int i;
-
- for (i = 0; envp[i]; i++)
- ;
-
- auxv = (Elf64_auxv_t *)&envp[i + 1];
-
- for (i = 0; auxv[i].a_type != AT_NULL; i++) {
- if (auxv[i].a_type == AT_SYSINFO_EHDR)
- return (void *)auxv[i].a_un.a_val;
- }
-
- return NULL;
-}
-
static Elf64_Dyn *vdso_get_dyntab(void *addr)
{
Elf64_Ehdr *ehdr = addr;
return 0;
}
-int main(int argc, char *argv[], char *envp[])
+int main(int argc, char *argv[])
{
struct sgx_enclave_run run;
struct vdso_symtab symtab;
addr = mmap((void *)encl.encl_base + seg->offset, seg->size,
seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0);
if (addr == MAP_FAILED) {
- fprintf(stderr, "mmap() failed, errno=%d.\n", errno);
+ perror("mmap() segment failed");
exit(KSFT_FAIL);
}
}
memset(&run, 0, sizeof(run));
run.tcs = encl.encl_base;
- addr = vdso_get_base_addr(envp);
+ /* Get vDSO base address */
+ addr = (void *)getauxval(AT_SYSINFO_EHDR);
if (!addr)
goto err;
struct kvm_coalesced_mmio_zone *zone)
{
struct kvm_coalesced_mmio_dev *dev, *tmp;
+ int r;
if (zone->pio != 1 && zone->pio != 0)
return -EINVAL;
mutex_lock(&kvm->slots_lock);
- list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+ list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
if (zone->pio == dev->zone.pio &&
coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
- kvm_io_bus_unregister_dev(kvm,
+ r = kvm_io_bus_unregister_dev(kvm,
zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
kvm_iodevice_destructor(&dev->dev);
+
+ /*
+ * On failure, unregister destroys all devices on the
+ * bus _except_ the target device, i.e. coalesced_zones
+ * has been modified. No need to restart the walk as
+ * there aren't any zones left.
+ */
+ if (r)
+ break;
}
+ }
mutex_unlock(&kvm->slots_lock);
+ /*
+ * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's
+ * perspective, the coalesced MMIO is most definitely unregistered.
+ */
return 0;
}
srcu_read_unlock(&kvm->srcu, idx);
}
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address,
- pte_t pte)
+typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
+ unsigned long end);
+
+struct kvm_hva_range {
+ unsigned long start;
+ unsigned long end;
+ pte_t pte;
+ hva_handler_t handler;
+ on_lock_fn_t on_lock;
+ bool flush_on_ret;
+ bool may_block;
+};
+
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler. The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int idx;
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
+
+static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
+ const struct kvm_hva_range *range)
+{
+ bool ret = false, locked = false;
+ struct kvm_gfn_range gfn_range;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ int i, idx;
+
+ /* A null handler is allowed if and only if on_lock() is provided. */
+ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
+ IS_KVM_NULL_FN(range->handler)))
+ return 0;
idx = srcu_read_lock(&kvm->srcu);
- KVM_MMU_LOCK(kvm);
+ /* The on_lock() path does not yet support lock elision. */
+ if (!IS_KVM_NULL_FN(range->on_lock)) {
+ locked = true;
+ KVM_MMU_LOCK(kvm);
- kvm->mmu_notifier_seq++;
+ range->on_lock(kvm, range->start, range->end);
+
+ if (IS_KVM_NULL_FN(range->handler))
+ goto out_unlock;
+ }
- if (kvm_set_spte_hva(kvm, address, pte))
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, slots) {
+ unsigned long hva_start, hva_end;
+
+ hva_start = max(range->start, slot->userspace_addr);
+ hva_end = min(range->end, slot->userspace_addr +
+ (slot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+
+ /*
+ * To optimize for the likely case where the address
+ * range is covered by zero or one memslots, don't
+ * bother making these conditional (to avoid writes on
+ * the second or later invocation of the handler).
+ */
+ gfn_range.pte = range->pte;
+ gfn_range.may_block = range->may_block;
+
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+ gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+ gfn_range.slot = slot;
+
+ if (!locked) {
+ locked = true;
+ KVM_MMU_LOCK(kvm);
+ }
+ ret |= range->handler(kvm, &gfn_range);
+ }
+ }
+
+ if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
kvm_flush_remote_tlbs(kvm);
- KVM_MMU_UNLOCK(kvm);
+out_unlock:
+ if (locked)
+ KVM_MMU_UNLOCK(kvm);
+
srcu_read_unlock(&kvm->srcu, idx);
+
+ /* The notifiers are averse to booleans. :-( */
+ return (int)ret;
}
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
+ unsigned long start,
+ unsigned long end,
+ pte_t pte,
+ hva_handler_t handler)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int need_tlb_flush = 0, idx;
+ const struct kvm_hva_range range = {
+ .start = start,
+ .end = end,
+ .pte = pte,
+ .handler = handler,
+ .on_lock = (void *)kvm_null_fn,
+ .flush_on_ret = true,
+ .may_block = false,
+ };
+
+ return __kvm_handle_hva_range(kvm, &range);
+}
+
+static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
+ unsigned long start,
+ unsigned long end,
+ hva_handler_t handler)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ const struct kvm_hva_range range = {
+ .start = start,
+ .end = end,
+ .pte = __pte(0),
+ .handler = handler,
+ .on_lock = (void *)kvm_null_fn,
+ .flush_on_ret = false,
+ .may_block = false,
+ };
+
+ return __kvm_handle_hva_range(kvm, &range);
+}
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+ trace_kvm_set_spte_hva(address);
- idx = srcu_read_lock(&kvm->srcu);
- KVM_MMU_LOCK(kvm);
+ /*
+ * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
+ * and so always runs with an elevated notifier count. This obviates
+ * the need to bump the sequence count.
+ */
+ WARN_ON_ONCE(!kvm->mmu_notifier_count);
+
+ kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+}
+
+static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+ unsigned long end)
+{
/*
* The count increase must become visible at unlock time as no
* spte can be established without taking the mmu_lock and
*/
kvm->mmu_notifier_count++;
if (likely(kvm->mmu_notifier_count == 1)) {
- kvm->mmu_notifier_range_start = range->start;
- kvm->mmu_notifier_range_end = range->end;
+ kvm->mmu_notifier_range_start = start;
+ kvm->mmu_notifier_range_end = end;
} else {
/*
* Fully tracking multiple concurrent ranges has dimishing
* complete.
*/
kvm->mmu_notifier_range_start =
- min(kvm->mmu_notifier_range_start, range->start);
+ min(kvm->mmu_notifier_range_start, start);
kvm->mmu_notifier_range_end =
- max(kvm->mmu_notifier_range_end, range->end);
+ max(kvm->mmu_notifier_range_end, end);
}
- need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
- range->flags);
- /* we've to flush the tlb before the pages can be freed */
- if (need_tlb_flush || kvm->tlbs_dirty)
- kvm_flush_remote_tlbs(kvm);
-
- KVM_MMU_UNLOCK(kvm);
- srcu_read_unlock(&kvm->srcu, idx);
-
- return 0;
}
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ const struct kvm_hva_range hva_range = {
+ .start = range->start,
+ .end = range->end,
+ .pte = __pte(0),
+ .handler = kvm_unmap_gfn_range,
+ .on_lock = kvm_inc_notifier_count,
+ .flush_on_ret = true,
+ .may_block = mmu_notifier_range_blockable(range),
+ };
- KVM_MMU_LOCK(kvm);
+ trace_kvm_unmap_hva_range(range->start, range->end);
+
+ __kvm_handle_hva_range(kvm, &hva_range);
+
+ return 0;
+}
+
+static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+ unsigned long end)
+{
/*
* This sequence increase will notify the kvm page fault that
* the page that is going to be mapped in the spte could have
* in conjunction with the smp_rmb in mmu_notifier_retry().
*/
kvm->mmu_notifier_count--;
- KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ const struct mmu_notifier_range *range)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ const struct kvm_hva_range hva_range = {
+ .start = range->start,
+ .end = range->end,
+ .pte = __pte(0),
+ .handler = (void *)kvm_null_fn,
+ .on_lock = kvm_dec_notifier_count,
+ .flush_on_ret = false,
+ .may_block = mmu_notifier_range_blockable(range),
+ };
+
+ __kvm_handle_hva_range(kvm, &hva_range);
BUG_ON(kvm->mmu_notifier_count < 0);
}
unsigned long start,
unsigned long end)
{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int young, idx;
+ trace_kvm_age_hva(start, end);
- idx = srcu_read_lock(&kvm->srcu);
- KVM_MMU_LOCK(kvm);
-
- young = kvm_age_hva(kvm, start, end);
- if (young)
- kvm_flush_remote_tlbs(kvm);
-
- KVM_MMU_UNLOCK(kvm);
- srcu_read_unlock(&kvm->srcu, idx);
-
- return young;
+ return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
}
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
unsigned long start,
unsigned long end)
{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int young, idx;
+ trace_kvm_age_hva(start, end);
- idx = srcu_read_lock(&kvm->srcu);
- KVM_MMU_LOCK(kvm);
/*
* Even though we do not flush TLB, this will still adversely
* affect performance on pre-Haswell Intel EPT, where there is
* cadence. If we find this inaccurate, we might come up with a
* more sophisticated heuristic later.
*/
- young = kvm_age_hva(kvm, start, end);
- KVM_MMU_UNLOCK(kvm);
- srcu_read_unlock(&kvm->srcu, idx);
-
- return young;
+ return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int young, idx;
+ trace_kvm_test_age_hva(address);
- idx = srcu_read_lock(&kvm->srcu);
- KVM_MMU_LOCK(kvm);
- young = kvm_test_age_hva(kvm, address);
- KVM_MMU_UNLOCK(kvm);
- srcu_read_unlock(&kvm->srcu, idx);
-
- return young;
+ return kvm_handle_hva_range_no_flush(mn, address, address + 1,
+ kvm_test_age_gfn);
}
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
return false;
}
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
!vcpu_dy_runnable(vcpu))
continue;
if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
- !kvm_arch_vcpu_in_kernel(vcpu))
+ !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+ !kvm_arch_vcpu_in_kernel(vcpu))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
if (r)
goto vcpu_decrement;
- vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!vcpu) {
r = -ENOMEM;
goto vcpu_decrement;
KVM_COMPAT(kvm_vm_compat_ioctl),
};
+bool file_is_kvm(struct file *file)
+{
+ return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
int r;
return 0;
}
-/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
- struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
{
int i, j;
struct kvm_io_bus *new_bus, *bus;
+ lockdep_assert_held(&kvm->slots_lock);
+
bus = kvm_get_bus(kvm, bus_idx);
if (!bus)
- return;
+ return 0;
- for (i = 0; i < bus->dev_count; i++)
+ for (i = 0; i < bus->dev_count; i++) {
if (bus->range[i].dev == dev) {
break;
}
+ }
if (i == bus->dev_count)
- return;
+ return 0;
new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
GFP_KERNEL_ACCOUNT);
new_bus->dev_count--;
memcpy(new_bus->range + i, bus->range + i + 1,
flex_array_size(new_bus, range, new_bus->dev_count - i));
- } else {
+ }
+
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ /* Destroy the old bus _after_ installing the (null) bus. */
+ if (!new_bus) {
pr_err("kvm: failed to shrink bus, removing it completely\n");
for (j = 0; j < bus->dev_count; j++) {
if (j == i)
}
}
- rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
- synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);
- return;
+ return new_bus ? 0 : -ENOMEM;
}
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,