Merge branch 'kvm-sev-cgroup' into HEAD
authorPaolo Bonzini <pbonzini@redhat.com>
Thu, 22 Apr 2021 06:39:48 +0000 (02:39 -0400)
committerPaolo Bonzini <pbonzini@redhat.com>
Thu, 22 Apr 2021 17:19:01 +0000 (13:19 -0400)
110 files changed:
Documentation/virt/kvm/amd-memory-encryption.rst
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/locking.rst
Documentation/virt/kvm/s390-diag.rst
Documentation/x86/sgx.rst
MAINTAINERS
arch/arm64/include/asm/kvm_host.h
arch/arm64/kvm/arm.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/trace_arm.h
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c
arch/mips/kvm/trap_emul.c
arch/mips/kvm/vz.c
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/kvm/trace_booke.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/smp.h
arch/s390/kernel/smp.c
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/vsie.c
arch/x86/Kconfig
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/sgx.h [new file with mode: 0644]
arch/x86/include/asm/svm.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/cpu/cpuid-deps.c
arch/x86/kernel/cpu/feat_ctl.c
arch/x86/kernel/cpu/scattered.c
arch/x86/kernel/cpu/sgx/Makefile
arch/x86/kernel/cpu/sgx/arch.h [deleted file]
arch/x86/kernel/cpu/sgx/driver.c
arch/x86/kernel/cpu/sgx/encl.c
arch/x86/kernel/cpu/sgx/encl.h
arch/x86/kernel/cpu/sgx/encls.h
arch/x86/kernel/cpu/sgx/ioctl.c
arch/x86/kernel/cpu/sgx/main.c
arch/x86/kernel/cpu/sgx/sgx.h
arch/x86/kernel/cpu/sgx/virt.c [new file with mode: 0644]
arch/x86/kernel/kvm.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_audit.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/vmenter.S
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/sgx.c [new file with mode: 0644]
arch/x86/kvm/vmx/sgx.h [new file with mode: 0644]
arch/x86/kvm/vmx/vmcs12.c
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/vmx/vmx_ops.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
drivers/crypto/ccp/sev-dev.c
drivers/crypto/ccp/sev-dev.h
include/linux/kvm_host.h
include/linux/psp-sev.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
tools/include/asm-generic/hugetlb_encode.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/test_util.h
tools/testing/selftests/kvm/kvm_page_table_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/assert.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/test_util.c
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
tools/testing/selftests/sgx/defines.h
tools/testing/selftests/sgx/load.c
tools/testing/selftests/sgx/main.c
virt/kvm/coalesced_mmio.c
virt/kvm/kvm_main.c

index 469a630..907adfe 100644 (file)
@@ -148,6 +148,9 @@ measurement. Since the guest owner knows the initial contents of the guest at
 boot, the measurement can be verified by comparing it to what the guest owner
 expects.
 
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
 Parameters (in): struct  kvm_sev_launch_measure
 
 Returns: 0 on success, -negative on error
@@ -271,6 +274,9 @@ report containing the SHA-256 digest of the guest memory and VMSA passed through
 commands and signed with the PEK. The digest returned by the command should match the digest
 used by the guest owner with the KVM_SEV_LAUNCH_MEASURE.
 
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
 Parameters (in): struct kvm_sev_attestation
 
 Returns: 0 on success, -negative on error
@@ -284,6 +290,142 @@ Returns: 0 on success, -negative on error
                 __u32 len;
         };
 
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_send_start {
+                __u32 policy;                 /* guest policy */
+
+                __u64 pdh_cert_uaddr;         /* platform Diffie-Hellman certificate */
+                __u32 pdh_cert_len;
+
+                __u64 plat_certs_uaddr;        /* platform certificate chain */
+                __u32 plat_certs_len;
+
+                __u64 amd_certs_uaddr;        /* AMD certificate */
+                __u32 amd_certs_len;
+
+                __u64 session_uaddr;          /* Guest session information */
+                __u32 session_len;
+        };
+
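As a purely illustrative sketch (not part of this patch), the zero-length query
above leads to a two-pass call sequence. Here ``vm_fd`` and ``sev_fd`` are
assumed to be open descriptors for the VM and /dev/sev, the certificate fields
are elided, and the command is issued through the KVM_MEMORY_ENCRYPT_OP ioctl::

  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hedged sketch: query the session blob size, then retry with a buffer. */
  static int sev_send_start(int vm_fd, int sev_fd)
  {
          struct kvm_sev_send_start start = {};
          struct kvm_sev_cmd cmd = {
                  .id = KVM_SEV_SEND_START,
                  .data = (unsigned long)&start,
                  .sev_fd = sev_fd,
          };

          /* First pass: session_len == 0 asks KVM for the required size. */
          ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

          start.session_uaddr = (unsigned long)malloc(start.session_len);
          /* pdh_cert_*, plat_certs_* and amd_certs_* setup is elided here. */
          if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0) {
                  fprintf(stderr, "SEND_START: firmware error %u\n", cmd.error);
                  return -1;
          }
          return 0;
  }
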
+12. KVM_SEV_SEND_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_SEND_UPDATE_DATA command can be used by the hypervisor to encrypt the
+outgoing guest memory region with the encryption context created using
+KVM_SEV_SEND_START.
+
+If hdr_len or trans_len is zero on entry, the lengths of the packet header and
+transport region are written to hdr_len and trans_len respectively, and all
+other fields are not used.
+
+Parameters (in): struct kvm_sev_send_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_send_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the source memory region to be encrypted */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the destination memory region  */
+                __u32 trans_len;
+        };
+
+13. KVM_SEV_SEND_FINISH
+------------------------
+
+After completion of the migration flow, the KVM_SEV_SEND_FINISH command can be
+issued by the hypervisor to delete the encryption context.
+
+Returns: 0 on success, -negative on error
+
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
+15. KVM_SEV_RECEIVE_START
+-------------------------
+
+The KVM_SEV_RECEIVE_START command is used for creating the memory encryption
+context for an incoming SEV guest. To create the encryption context, the user must
+provide a guest policy, the platform public Diffie-Hellman (PDH) key and session
+information.
+
+Parameters: struct  kvm_sev_receive_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 pdh_uaddr;        /* userspace address pointing to the PDH key */
+                __u32 pdh_len;
+
+                __u64 session_uaddr;    /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains a new handle; on error, a negative
+value is returned.
+
+For more details, see SEV spec Section 6.12.
+
+16. KVM_SEV_RECEIVE_UPDATE_DATA
+-------------------------------
+
+The KVM_SEV_RECEIVE_UPDATE_DATA command can be used by the hypervisor to copy
+the incoming buffers into the guest memory region with the encryption context
+created during KVM_SEV_RECEIVE_START.
+
+Parameters (in): struct kvm_sev_receive_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the destination guest memory region */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the incoming buffer memory region  */
+                __u32 trans_len;
+        };
+
+17. KVM_SEV_RECEIVE_FINISH
+--------------------------
+
+After completion of the migration flow, the KVM_SEV_RECEIVE_FINISH command can be
+issued by the hypervisor to make the guest ready for execution.
+
+Returns: 0 on success, -negative on error
+
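Tying the receive-side commands together, the following is a hedged sketch of
the destination VMM's loop. ``sev_ioctl()`` is a local helper around
KVM_MEMORY_ENCRYPT_OP and ``recv_packet()`` stands in for the migration
transport; neither is a kernel or QEMU API, and error handling is minimal::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int sev_ioctl(int vm_fd, int sev_fd, __u32 id, void *data)
  {
          struct kvm_sev_cmd cmd = {
                  .id = id,
                  .data = (unsigned long)data,
                  .sev_fd = sev_fd,
          };

          return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
  }

  /* Placeholder: fetch the next encrypted migration packet from the source. */
  extern int recv_packet(struct kvm_sev_receive_update_data *update);

  static int sev_receive_guest(int vm_fd, int sev_fd)
  {
          struct kvm_sev_receive_start start = {0};   /* policy, PDH, session */
          struct kvm_sev_receive_update_data update;

          if (sev_ioctl(vm_fd, sev_fd, KVM_SEV_RECEIVE_START, &start))
                  return -1;

          while (recv_packet(&update))
                  if (sev_ioctl(vm_fd, sev_fd, KVM_SEV_RECEIVE_UPDATE_DATA, &update))
                          return -1;

          /* Make the guest runnable once every packet has been injected. */
          return sev_ioctl(vm_fd, sev_fd, KVM_SEV_RECEIVE_FINISH, NULL);
  }
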
 References
 ==========
 
index 307f2fc..56c6fca 100644 (file)
@@ -204,7 +204,7 @@ Errors:
 
   ======     ============================================================
   EFAULT     the msr index list cannot be read from or written to
-  E2BIG      the msr index list is to be to fit in the array specified by
+  E2BIG      the msr index list is too big to fit in the array specified by
              the user.
   ======     ============================================================
 
@@ -3358,6 +3358,9 @@ indicating the number of supported registers.
 For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
 the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
 
+Also, when supported, the KVM_CAP_SET_GUEST_DEBUG2 capability indicates the
+set of supported KVM_GUESTDBG_* bits in the control field.
+
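For example (a minimal sketch; ``vm_fd`` is assumed to be an open VM file
descriptor), the mask can be queried with KVM_CHECK_EXTENSION::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Returns the KVM_GUESTDBG_* bits KVM accepts, or 0 if the capability is
   * not reported by this kernel/architecture. */
  static int supported_guestdbg_bits(int vm_fd)
  {
          int mask = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SET_GUEST_DEBUG2);

          return mask > 0 ? mask : 0;
  }
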
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
@@ -3690,31 +3693,105 @@ which is the maximum number of possibly pending cpu-local interrupts.
 
 Queues an SMI on the thread's vcpu.
 
-4.97 KVM_CAP_PPC_MULTITCE
--------------------------
+4.97 KVM_X86_SET_MSR_FILTER
+----------------------------
 
-:Capability: KVM_CAP_PPC_MULTITCE
-:Architectures: ppc
-:Type: vm
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
 
-This capability means the kernel is capable of handling hypercalls
-H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
-space. This significantly accelerates DMA operations for PPC KVM guests.
-User space should expect that its handlers for these hypercalls
-are not going to be called if user space previously registered LIOBN
-in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+::
 
-In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
-user space might have to advertise it for the guest. For example,
-IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
-present in the "ibm,hypertas-functions" device-tree property.
+  struct kvm_msr_filter_range {
+  #define KVM_MSR_FILTER_READ  (1 << 0)
+  #define KVM_MSR_FILTER_WRITE (1 << 1)
+       __u32 flags;
+       __u32 nmsrs; /* number of msrs in bitmap */
+       __u32 base;  /* MSR index the bitmap starts at */
+       __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+  };
 
-The hypercalls mentioned above may or may not be processed successfully
-in the kernel based fast path. If they can not be handled by the kernel,
-they will get passed on to user space. So user space still has to have
-an implementation for these despite the in kernel acceleration.
+  #define KVM_MSR_FILTER_MAX_RANGES 16
+  struct kvm_msr_filter {
+  #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+  #define KVM_MSR_FILTER_DEFAULT_DENY  (1 << 0)
+       __u32 flags;
+       struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+  };
 
-This capability is always enabled.
+flags values for ``struct kvm_msr_filter_range``:
+
+``KVM_MSR_FILTER_READ``
+
+  Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a read should immediately fail, while a 1 indicates that
+  a read for a particular MSR should be handled regardless of the default
+  filter action.
+
+``KVM_MSR_FILTER_WRITE``
+
+  Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a write should immediately fail, while a 1 indicates that
+  a write for a particular MSR should be handled regardless of the default
+  filter action.
+
+``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
+
+  Filter both read and write accesses to MSRs using the given bitmap. A 0
+  in the bitmap indicates that both reads and writes should immediately fail,
+  while a 1 indicates that reads and writes for a particular MSR are not
+  filtered by this range.
+
+flags values for ``struct kvm_msr_filter``:
+
+``KVM_MSR_FILTER_DEFAULT_ALLOW``
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to allowing access to the MSR.
+
+``KVM_MSR_FILTER_DEFAULT_DENY``
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to rejecting access to the MSR. In this mode, all MSRs that should
+  be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be explicitly filtered or not.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+default KVM in-kernel emulation behavior is fully preserved.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
+an error.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
+x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
+and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
+register.
+
+If an MSR index falls within one of the defined ranges, read and write accesses
+are guarded by the bitmap's value for that MSR index, provided the kind of
+access is included in the ``struct kvm_msr_filter_range`` flags.  If no range
+covers this particular access, the behavior is determined by the flags
+field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
+and ``KVM_MSR_FILTER_DEFAULT_DENY``.
+
+Each bitmap range specifies a range of MSRs to potentially allow access to.
+The range covers the nmsrs MSR indexes [base .. base+nmsrs). The flags field
+indicates whether reads, writes or both reads and writes are filtered
+by setting a 1 bit in the bitmap for the corresponding MSR index.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, filtered
+MSR accesses can instead be deflected to user space and potentially handled
+there.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
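
To make the layout concrete, here is a hedged userspace sketch (the chosen MSR
range, the deny-by-default policy and ``vm_fd`` are arbitrary illustrations, not
recommendations) that installs a single-range filter::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch: allow reads and writes only to the 64 MSRs starting at 0xc0000000,
   * deny everything else (x2APIC MSRs stay unfiltered, as described above). */
  static int install_msr_filter(int vm_fd)
  {
          static __u8 bitmap[64 / 8];
          struct kvm_msr_filter filter = {
                  .flags = KVM_MSR_FILTER_DEFAULT_DENY,
          };

          memset(bitmap, 0xff, sizeof(bitmap));        /* 1 = access allowed */
          filter.ranges[0] = (struct kvm_msr_filter_range) {
                  .flags  = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
                  .nmsrs  = 64,
                  .base   = 0xc0000000,
                  .bitmap = bitmap,
          };

          return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
  }
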
 
 4.98 KVM_CREATE_SPAPR_TCE_64
 ----------------------------
@@ -4855,7 +4932,7 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
 KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
   Sets the exception vector used to deliver Xen event channel upcalls.
 
-4.128 KVM_XEN_HVM_GET_ATTR
+4.127 KVM_XEN_HVM_GET_ATTR
 --------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4867,7 +4944,7 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
 Allows Xen VM attributes to be read. For the structure and types,
 see KVM_XEN_HVM_SET_ATTR above.
 
-4.129 KVM_XEN_VCPU_SET_ATTR
+4.128 KVM_XEN_VCPU_SET_ATTR
 ---------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4929,7 +5006,7 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
   or RUNSTATE_offline) to set the current accounted state as of the
   adjusted state_entry_time.
 
-4.130 KVM_XEN_VCPU_GET_ATTR
+4.129 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -6233,6 +6310,45 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
 This capability can be used to check / enable 2nd DAWR feature provided
 by POWER10 processor.
 
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+:Architectures: x86 SEV enabled
+:Type: vm
+:Parameters: args[0] is the fd of the source vm
+:Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
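A hedged sketch of enabling this capability on the destination ("mirror") VM;
both file descriptor names are placeholders::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch: let mirror_vm_fd share the SEV encryption context of source_vm_fd. */
  static int copy_enc_context(int mirror_vm_fd, int source_vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM,
                  .args = { source_vm_fd },
          };

          return ioctl(mirror_vm_fd, KVM_ENABLE_CAP, &cap);
  }
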
+7.25 KVM_CAP_SGX_ATTRIBUTE
+--------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+          attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more privileged enclave attributes.  args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint.  To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
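As a hedged example of granting the PROVISIONKEY attribute (the attribute file
path below is an assumption based on what current kernels expose as
/dev/sgx_provision; error handling is minimal)::

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int allow_sgx_provisioning(int vm_fd)
  {
          int attr_fd = open("/dev/sgx_provision", O_RDONLY);  /* assumed path */
          struct kvm_enable_cap cap = { .cap = KVM_CAP_SGX_ATTRIBUTE };

          if (attr_fd < 0)
                  return -1;

          cap.args[0] = attr_fd;
          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }
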
 8. Other capabilities.
 ======================
 
@@ -6727,3 +6843,29 @@ vcpu_info is set.
 The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+
+8.31 KVM_CAP_PPC_MULTITCE
+-------------------------
+
+:Capability: KVM_CAP_PPC_MULTITCE
+:Architectures: ppc
+:Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
index 0aa4817..1fc860c 100644 (file)
@@ -38,25 +38,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the
 following two cases:
 
 1. Access Tracking: The SPTE is not present, but it is marked for access
-   tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
-   restore the saved R/X bits. This is described in more detail later below.
+   tracking. That means we need to restore the saved R/X bits. This is
+   described in more detail later below.
 
-2. Write-Protection: The SPTE is present and the fault is
-   caused by write-protect. That means we just need to change the W bit of
-   the spte.
+2. Write-Protection: The SPTE is present and the fault is caused by
+   write-protect. That means we just need to change the W bit of the spte.
 
-What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
-SPTE_MMU_WRITEABLE bit on the spte:
+What we use to avoid all these races is the Host-writable bit and the
+MMU-writable bit on the spte:
 
-- SPTE_HOST_WRITEABLE means the gfn is writable on host.
-- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
-  the gfn is writable on guest mmu and it is not write-protected by shadow
-  page write-protection.
+- Host-writable means the gfn is writable in the host kernel page tables and in
+  its KVM memslot.
+- MMU-writable means the gfn is writable in the guest's mmu and it is not
+  write-protected by shadow page write-protection.
 
 On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
-restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
-is safe because whenever changing these bits can be detected by cmpxchg.
+bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, or restore the saved
+R/X bits for an access-tracked spte, or both. This is safe because any change
+to these bits can be detected by cmpxchg.
 
 But we need carefully check these cases:
 
@@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 Lockless Access Tracking:
 
 This is used for Intel CPUs that are using EPT but do not support the EPT A/D
-bits. In this case, when the KVM MMU notifier is called to track accesses to a
-page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
-by clearing the RWX bits in the PTE and storing the original R & X bits in
-some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
-PTE (using the ignored bit 62). When the VM tries to access the page later on,
-a fault is generated and the fast page fault mechanism described above is used
-to atomically restore the PTE to a Present state. The W bit is not saved when
-the PTE is marked for access tracking and during restoration to the Present
-state, the W bit is set depending on whether or not it was a write access. If
-it wasn't, then the W bit will remain clear until a write access happens, at
-which time it will be set using the Dirty tracking mechanism described above.
+bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and
+when the KVM MMU notifier is called to track accesses to a page (via
+kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware
+by clearing the RWX bits in the PTE and storing the original R & X bits in more
+unused/ignored bits. When the VM tries to access the page later on, a fault is
+generated and the fast page fault mechanism described above is used to
+atomically restore the PTE to a Present state. The W bit is not saved when the
+PTE is marked for access tracking and during restoration to the Present state,
+the W bit is set depending on whether or not it was a write access. If it
+wasn't, then the W bit will remain clear until a write access happens, at which
+time it will be set using the Dirty tracking mechanism described above.
 
 3. Reference
 ------------
index eaac486..ca85f03 100644 (file)
@@ -84,3 +84,36 @@ If the function code specifies 0x501, breakpoint functions may be performed.
 This function code is handled by userspace.
 
 This diagnose function code has no subfunctions and uses no parameters.
+
+
+DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield
+---------------------------------------------------------
+
+General register 1 contains the target CPU address.
+
+In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs,
+DIAGNOSE with function code 0x9c may improve system performance by
+yielding the host CPU on which the guest CPU is running, so that it can be
+assigned to another guest CPU, preferably the logical CPU containing the
+specified target CPU.
+
+
+DIAG 'X'9C forwarding
++++++++++++++++++++++
+
+The guest may send a DIAGNOSE 0x9c in order to yield to a certain
+other vcpu. An example is a Linux guest that tries to yield to the vcpu
+that is currently holding a spinlock, but not running.
+
+However, on the host the real cpu backing the vcpu may itself not be
+running.
+Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to
+the backing cpu will hopefully cause that cpu, and thus subsequently
+the guest's vcpu, to be scheduled.
+
+
+diag9c_forwarding_hz
+    KVM kernel parameter that specifies the maximum number of DIAGNOSE 0x9c
+    forwardings per second, in order to avoid a DIAGNOSE 0x9c forwarding
+    storm.
+    A value of 0 turns the forwarding off.
index eaee136..dd0ac96 100644 (file)
@@ -209,3 +209,44 @@ An application may be loaded into a container enclave which is specially
 configured with a library OS and run-time which permits the application to run.
 The enclave run-time and library OS work together to execute the application
 when a thread enters the enclave.
+
+Impact of Potential Kernel SGX Bugs
+===================================
+
+EPC leaks
+---------
+
+When EPC page leaks happen, a WARNING like this is shown in dmesg:
+
+"EREMOVE returned ... and an EPC page was leaked.  SGX may become unusable..."
+
+This is effectively a kernel use-after-free of an EPC page, and due
+to the way SGX works, the bug is detected at freeing. Rather than
+adding the page back to the pool of available EPC pages, the kernel
+intentionally leaks the page to avoid additional errors in the future.
+
+When this happens, the kernel will likely soon leak more EPC pages, and
+SGX will likely become unusable because the memory available to SGX is
+limited. However, while this may be fatal to SGX, the rest of the kernel
+is unlikely to be impacted and should continue to work.
+
+As a result, when this happens, the user should stop running any new
+SGX workloads (or just any new workloads), and migrate all valuable
+workloads. Although a machine reboot can recover all EPC memory, the bug
+should be reported to Linux developers.
+
+
+Virtual EPC
+===========
+
+The implementation also has a virtual EPC driver to support SGX enclaves
+in guests. Unlike the SGX driver, an EPC page allocated by the virtual
+EPC driver doesn't have a specific enclave associated with it. This is
+because KVM doesn't track how a guest uses EPC pages.
+
+As a result, the SGX core page reclaimer doesn't support reclaiming EPC
+pages allocated to KVM guests through the virtual EPC driver. If the
+user wants to deploy SGX applications both on the host and in guests
+on the same machine, the user should reserve enough EPC (by subtracting the
+total virtual EPC size of all SGX VMs from the physical EPC size) for
+host SGX applications so they can run with acceptable performance.
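
As a purely illustrative example: on a host with 128 MB of physical EPC that
runs two SGX VMs with 32 MB of virtual EPC each, only 128 - 2*32 = 64 MB of
EPC remains for host enclaves, and host SGX applications should be sized
against that remainder.
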
index c80ad73..0417ebf 100644 (file)
@@ -9273,6 +9273,7 @@ Q:        https://patchwork.kernel.org/project/intel-sgx/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/sgx
 F:     Documentation/x86/sgx.rst
 F:     arch/x86/entry/vdso/vsgx.S
+F:     arch/x86/include/asm/sgx.h
 F:     arch/x86/include/uapi/asm/sgx.h
 F:     arch/x86/kernel/cpu/sgx/*
 F:     tools/testing/selftests/sgx/*
index 3d10e65..3a708be 100644 (file)
@@ -401,6 +401,10 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_PENDING_EXCEPTION    (1 << 8) /* Exception pending */
 #define KVM_ARM64_EXCEPT_MASK          (7 << 9) /* Target EL/MODE */
 
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
+                                KVM_GUESTDBG_USE_SW_BP | \
+                                KVM_GUESTDBG_USE_HW | \
+                                KVM_GUESTDBG_SINGLESTEP)
 /*
  * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
  * take the following values:
@@ -582,11 +586,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
index 7f06ba7..0d92a4e 100644 (file)
@@ -208,6 +208,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_VCPU_ATTRIBUTES:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               return KVM_GUESTDBG_VALID_MASK;
        case KVM_CAP_ARM_SET_DEVICE_ADDR:
                r = 1;
                break;
@@ -1268,7 +1270,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
        kvm_flush_remote_tlbs(kvm);
 }
index 9bbd30e..6cb39ee 100644 (file)
@@ -888,11 +888,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return -EINVAL;
 }
 
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
-                           KVM_GUESTDBG_USE_SW_BP | \
-                           KVM_GUESTDBG_USE_HW | \
-                           KVM_GUESTDBG_SINGLESTEP)
-
 /**
  * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
  * @kvm:       pointer to the KVM struct
index 8711894..3572823 100644 (file)
@@ -839,7 +839,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
         * the page we just got a reference to gets unmapped before we have a
         * chance to grab the mmu_lock, which ensure that if the page gets
-        * unmapped afterwards, the call to kvm_unmap_hva will take it away
+        * unmapped afterwards, the call to kvm_unmap_gfn will take it away
         * from us again properly. This smp_rmb() interacts with the smp_wmb()
         * in kvm_mmu_notifier_invalidate_<page|range_end>.
         */
@@ -1064,126 +1064,70 @@ out_unlock:
        return ret;
 }
 
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm,
-                                           gpa_t gpa, u64 size,
-                                           void *data),
-                            void *data)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gpa;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-               ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-       }
-
-       return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       unsigned flags = *(unsigned *)data;
-       bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
-
-       __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
-       return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        if (!kvm->arch.mmu.pgt)
                return 0;
 
-       trace_kvm_unmap_hva_range(start, end);
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       kvm_pfn_t *pfn = (kvm_pfn_t *)data;
-
-       WARN_ON(size != PAGE_SIZE);
+       __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
+                            (range->end - range->start) << PAGE_SHIFT,
+                            range->may_block);
 
-       /*
-        * The MMU notifiers will have unmapped a huge PMD before calling
-        * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-        * therefore we never need to clear out a huge PMD through this
-        * calling path and a memcache is not required.
-        */
-       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
-                              __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
        return 0;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       unsigned long end = hva + PAGE_SIZE;
-       kvm_pfn_t pfn = pte_pfn(pte);
+       kvm_pfn_t pfn = pte_pfn(range->pte);
 
        if (!kvm->arch.mmu.pgt)
                return 0;
 
-       trace_kvm_set_spte_hva(hva);
+       WARN_ON(range->end - range->start != 1);
 
        /*
         * We've moved a page around, probably through CoW, so let's treat it
         * just like a translation fault and clean the cache to the PoC.
         */
        clean_dcache_guest_page(pfn, PAGE_SIZE);
-       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
+
+       /*
+        * The MMU notifiers will have unmapped a huge PMD before calling
+        * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
+        * therefore we never need to clear out a huge PMD through this
+        * calling path and a memcache is not required.
+        */
+       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
+                              PAGE_SIZE, __pfn_to_phys(pfn),
+                              KVM_PGTABLE_PROT_R, NULL);
+
        return 0;
 }
 
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       pte_t pte;
+       u64 size = (range->end - range->start) << PAGE_SHIFT;
        kvm_pte_t kpte;
+       pte_t pte;
+
+       if (!kvm->arch.mmu.pgt)
+               return 0;
 
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+
+       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
+                                       range->start << PAGE_SHIFT);
        pte = __pte(kpte);
        return pte_valid(pte) && pte_young(pte);
 }
 
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        if (!kvm->arch.mmu.pgt)
                return 0;
-       trace_kvm_age_hva(start, end);
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       if (!kvm->arch.mmu.pgt)
-               return 0;
-       trace_kvm_test_age_hva(hva);
-       return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
-                                kvm_test_age_hva_handler, NULL);
+       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
+                                          range->start << PAGE_SHIFT);
 }
 
 phys_addr_t kvm_mmu_get_httbr(void)
index ff04443..33e4e7d 100644 (file)
@@ -135,72 +135,6 @@ TRACE_EVENT(kvm_mmio_emulate,
                  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
 );
 
-TRACE_EVENT(kvm_unmap_hva_range,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
 TRACE_EVENT(kvm_set_way_flush,
            TP_PROTO(unsigned long vcpu_pc, bool cache),
            TP_ARGS(vcpu_pc, cache),
index 3a5612e..d0944a7 100644 (file)
@@ -815,14 +815,7 @@ struct kvm_mips_callbacks {
        int (*vcpu_init)(struct kvm_vcpu *vcpu);
        void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
        int (*vcpu_setup)(struct kvm_vcpu *vcpu);
-       void (*flush_shadow_all)(struct kvm *kvm);
-       /*
-        * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
-        * VZ root TLB, or T&E GVA page tables and corresponding root TLB
-        * mappings).
-        */
-       void (*flush_shadow_memslot)(struct kvm *kvm,
-                                    const struct kvm_memory_slot *slot);
+       void (*prepare_flush_shadow)(struct kvm *kvm);
        gpa_t (*gva_to_gpa)(gva_t gva);
        void (*queue_timer_int)(struct kvm_vcpu *vcpu);
        void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -967,11 +960,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
                                                   bool write);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 /* Emulation */
 int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
@@ -1154,4 +1142,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+
 #endif /* __MIPS_KVM_HOST_H__ */
index 58a8812..4a22ba7 100644 (file)
@@ -204,9 +204,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        /* Flush whole GPA */
        kvm_mips_flush_gpa_pt(kvm, 0, ~0);
-
-       /* Let implementation do the rest */
-       kvm_mips_callbacks->flush_shadow_all(kvm);
+       kvm_flush_remote_tlbs(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -221,8 +219,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
        /* Flush slot from GPA */
        kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
                              slot->base_gfn + slot->npages - 1);
-       /* Let implementation do the rest */
-       kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+       kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
        spin_unlock(&kvm->mmu_lock);
 }
 
@@ -262,9 +259,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                /* Write protect GPA page table entries */
                needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
                                        new->base_gfn + new->npages - 1);
-               /* Let implementation do the rest */
                if (needs_flush)
-                       kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+                       kvm_arch_flush_remote_tlbs_memslot(kvm, new);
                spin_unlock(&kvm->mmu_lock);
        }
 }
@@ -996,11 +992,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 }
 
+int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+       kvm_mips_callbacks->prepare_flush_shadow(kvm);
+       return 1;
+}
+
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
-       /* Let implementation handle TLB/GVA invalidation */
-       kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+       kvm_flush_remote_tlbs(kvm);
 }
 
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
index 3dabeda..8af002b 100644 (file)
@@ -439,85 +439,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
                                  end_gfn << PAGE_SHIFT);
 }
 
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm, gfn_t gfn,
-                                           gpa_t gfn_end,
-                                           struct kvm_memory_slot *memslot,
-                                           void *data),
-                            void *data)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-               ret |= handler(kvm, gfn, gfn_end, memslot, data);
-       }
-
-       return ret;
-}
-
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                                struct kvm_memory_slot *memslot, void *data)
-{
-       kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+       kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
        return 1;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
-{
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-
-       kvm_mips_callbacks->flush_shadow_all(kvm);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                               struct kvm_memory_slot *memslot, void *data)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       gpa_t gpa = gfn << PAGE_SHIFT;
-       pte_t hva_pte = *(pte_t *)data;
+       gpa_t gpa = range->start << PAGE_SHIFT;
+       pte_t hva_pte = range->pte;
        pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
        pte_t old_pte;
 
        if (!gpa_pte)
-               return 0;
+               return false;
 
        /* Mapping may need adjusting depending on memslot flags */
        old_pte = *gpa_pte;
-       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+       if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
                hva_pte = pte_mkclean(hva_pte);
-       else if (memslot->flags & KVM_MEM_READONLY)
+       else if (range->slot->flags & KVM_MEM_READONLY)
                hva_pte = pte_wrprotect(hva_pte);
 
        set_pte(gpa_pte, hva_pte);
 
        /* Replacing an absent or old page doesn't need flushes */
        if (!pte_present(old_pte) || !pte_young(old_pte))
-               return 0;
+               return false;
 
        /* Pages swapped, aged, moved, or cleaned require flushes */
        return !pte_present(hva_pte) ||
@@ -526,27 +475,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
               (pte_dirty(old_pte) && !pte_dirty(hva_pte));
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
-       unsigned long end = hva + PAGE_SIZE;
-       int ret;
-
-       ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
-       if (ret)
-               kvm_mips_callbacks->flush_shadow_all(kvm);
-       return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                              struct kvm_memory_slot *memslot, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
+       return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
 }
 
-static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                                   struct kvm_memory_slot *memslot, void *data)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       gpa_t gpa = gfn << PAGE_SHIFT;
+       gpa_t gpa = range->start << PAGE_SHIFT;
        pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
 
        if (!gpa_pte)
@@ -554,16 +490,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
        return pte_young(*gpa_pte);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
 /**
  * _kvm_mips_map_page_fast() - Fast path GPA fault handler.
  * @vcpu:              VCPU pointer.
index 0788c00..5f2df49 100644 (file)
@@ -687,16 +687,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
+static void kvm_trap_emul_prepare_flush_shadow(struct kvm *kvm)
 {
-       /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
-       kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
-                                       const struct kvm_memory_slot *slot)
-{
-       kvm_trap_emul_flush_shadow_all(kvm);
 }
 
 static u64 kvm_trap_emul_get_one_regs[] = {
@@ -1280,8 +1272,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .vcpu_init = kvm_trap_emul_vcpu_init,
        .vcpu_uninit = kvm_trap_emul_vcpu_uninit,
        .vcpu_setup = kvm_trap_emul_vcpu_setup,
-       .flush_shadow_all = kvm_trap_emul_flush_shadow_all,
-       .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
+       .prepare_flush_shadow = kvm_trap_emul_prepare_flush_shadow,
        .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
        .queue_timer_int = kvm_mips_queue_timer_int_cb,
        .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
index 2ffbe92..2c75571 100644 (file)
@@ -3211,32 +3211,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+static void kvm_vz_prepare_flush_shadow(struct kvm *kvm)
 {
-       if (cpu_has_guestid) {
-               /* Flush GuestID for each VCPU individually */
-               kvm_flush_remote_tlbs(kvm);
-       } else {
+       if (!cpu_has_guestid) {
                /*
                 * For each CPU there is a single GPA ASID used by all VCPUs in
                 * the VM, so it doesn't make sense for the VCPUs to handle
                 * invalidation of these ASIDs individually.
                 *
                 * Instead mark all CPUs as needing ASID invalidation in
-                * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+                * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will
                 * kick any running VCPUs so they check asid_flush_mask.
                 */
                cpumask_setall(&kvm->arch.asid_flush_mask);
-               kvm_flush_remote_tlbs(kvm);
        }
 }
 
-static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
-                                       const struct kvm_memory_slot *slot)
-{
-       kvm_vz_flush_shadow_all(kvm);
-}
-
 static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
@@ -3292,8 +3282,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
        .vcpu_init = kvm_vz_vcpu_init,
        .vcpu_uninit = kvm_vz_vcpu_uninit,
        .vcpu_setup = kvm_vz_vcpu_setup,
-       .flush_shadow_all = kvm_vz_flush_shadow_all,
-       .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+       .prepare_flush_shadow = kvm_vz_prepare_flush_shadow,
        .gva_to_gpa = kvm_vz_gva_to_gpa_cb,
        .queue_timer_int = kvm_vz_queue_timer_int_cb,
        .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
index 2f5f919..2d03f29 100644 (file)
@@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
                                      unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
-extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
-extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
-extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
+extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                           unsigned long gfn);
+extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                         unsigned long gfn);
+extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                              unsigned long gfn);
 extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map);
 extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
index 05fb00d..1e83359 100644 (file)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 
-extern int kvm_unmap_hva_range(struct kvm *kvm,
-                              unsigned long start, unsigned long end,
-                              unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
 #define HPTEG_CACHE_NUM                        (1 << 15)
 #define HPTEG_HASH_BITS_PTE            13
 #define HPTEG_HASH_BITS_PTE_LONG       12
index 8aacd76..21ab033 100644 (file)
@@ -281,11 +281,10 @@ struct kvmppc_ops {
                                     const struct kvm_memory_slot *old,
                                     const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change);
-       int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
-                          unsigned long end);
-       int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
-       int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
-       void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+       bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
        void (*free_memslot)(struct kvm_memory_slot *slot);
        int (*init_vm)(struct kvm *kvm);
        void (*destroy_vm)(struct kvm *kvm);
index 44bf567..2b691f4 100644 (file)
@@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
        kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+       return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+       return kvm->arch.kvm_ops->age_gfn(kvm, range);
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+       return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
-       return 0;
+       return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
index 9b6323e..740e51d 100644 (file)
@@ -9,12 +9,10 @@
 
 extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
                                         struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
-                                 unsigned long end);
-extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
-                         unsigned long end);
-extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 
 extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
index bb67735..b7bd9ca 100644 (file)
@@ -752,51 +752,6 @@ void kvmppc_rmap_reset(struct kvm *kvm)
        srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
-typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                             unsigned long gfn);
-
-static int kvm_handle_hva_range(struct kvm *kvm,
-                               unsigned long start,
-                               unsigned long end,
-                               hva_handler_fn handler)
-{
-       int ret;
-       int retval = 0;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn, gfn+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-               for (; gfn < gfn_end; ++gfn) {
-                       ret = handler(kvm, memslot, gfn);
-                       retval |= ret;
-               }
-       }
-
-       return retval;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         hva_handler_fn handler)
-{
-       return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
-}
-
 /* Must be called with both HPTE and rmap locked */
 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
                              struct kvm_memory_slot *memslot,
@@ -840,8 +795,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
        }
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                          unsigned long gfn)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                           unsigned long gfn)
 {
        unsigned long i;
        __be64 *hptep;
@@ -874,16 +829,15 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                unlock_rmap(rmapp);
                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        }
-       return 0;
+       return false;
 }
 
-int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_unmap_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
-       kvm_handle_hva_range(kvm, start, end, handler);
-       return 0;
+       return kvm_unmap_rmapp(kvm, range->slot, range->start);
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
@@ -913,8 +867,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
        }
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                        unsigned long gfn)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                         unsigned long gfn)
 {
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
@@ -968,26 +922,26 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
        return ret;
 }
 
-int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_age_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
-       return kvm_handle_hva_range(kvm, start, end, handler);
+       return kvm_age_rmapp(kvm, range->slot, range->start);
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                             unsigned long gfn)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                              unsigned long gfn)
 {
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
        unsigned long *hp;
-       int ret = 1;
+       bool ret = true;
        unsigned long *rmapp;
 
        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
-               return 1;
+               return true;
 
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
@@ -1002,27 +956,27 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                                goto out;
                } while ((i = j) != head);
        }
-       ret = 0;
+       ret = false;
 
  out:
        unlock_rmap(rmapp);
        return ret;
 }
 
-int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_test_age_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
-       return kvm_handle_hva(kvm, hva, handler);
+       return kvm_test_age_rmapp(kvm, range->slot, range->start);
 }
 
-void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_unmap_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
-       kvm_handle_hva(kvm, hva, handler);
+       return kvm_unmap_rmapp(kvm, range->slot, range->start);
 }
 
 static int vcpus_running(struct kvm *kvm)
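
The converted handlers above act on range->start only, whereas the removed kvm_handle_hva_range() walked every gfn in [start, end). If the notifier can pass multi-page ranges, a range-walking variant along these lines (a sketch under that assumption, not the committed code) preserves the old coverage:

/* Sketch: cover the whole gfn range for both radix and hash guests. */
bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
        bool ret = false;
        gfn_t gfn;

        for (gfn = range->start; gfn < range->end; gfn++) {
                if (kvm_is_radix(kvm))
                        ret |= kvm_unmap_radix(kvm, range->slot, gfn);
                else
                        ret |= kvm_unmap_rmapp(kvm, range->slot, gfn);
        }

        return ret;
}
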
index e603de7..ec4f58f 100644 (file)
@@ -993,8 +993,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                   unsigned long gfn)
+bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                    unsigned long gfn)
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
@@ -1002,24 +1002,24 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
-               return 0;
+               return false;
        }
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep))
                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                 kvm->arch.lpid);
-       return 0;
+       return false;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                 unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                  unsigned long gfn)
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       int ref = 0;
+       bool ref = false;
        unsigned long old, *rmapp;
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
@@ -1035,26 +1035,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
-               ref = 1;
+               ref = true;
        }
        return ref;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                      unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                       unsigned long gfn)
+
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       int ref = 0;
+       bool ref = false;
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
-               ref = 1;
+               ref = true;
        return ref;
 }
 
index 13bad6b..07682ad 100644 (file)
@@ -4770,7 +4770,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
                kvmhv_release_all_nested(kvm);
        kvmppc_rmap_reset(kvm);
        kvm->arch.process_table = 0;
-       /* Mutual exclusion with kvm_unmap_hva_range etc. */
+       /* Mutual exclusion with kvm_unmap_gfn_range etc. */
        spin_lock(&kvm->mmu_lock);
        kvm->arch.radix = 0;
        spin_unlock(&kvm->mmu_lock);
@@ -4792,7 +4792,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
        if (err)
                return err;
        kvmppc_rmap_reset(kvm);
-       /* Mutual exclusion with kvm_unmap_hva_range etc. */
+       /* Mutual exclusion with kvm_unmap_gfn_range etc. */
        spin_lock(&kvm->mmu_lock);
        kvm->arch.radix = 1;
        spin_unlock(&kvm->mmu_lock);
@@ -5654,10 +5654,10 @@ static struct kvmppc_ops kvm_ops_hv = {
        .flush_memslot  = kvmppc_core_flush_memslot_hv,
        .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
        .commit_memory_region  = kvmppc_core_commit_memory_region_hv,
-       .unmap_hva_range = kvm_unmap_hva_range_hv,
-       .age_hva  = kvm_age_hva_hv,
-       .test_age_hva = kvm_test_age_hva_hv,
-       .set_spte_hva = kvm_set_spte_hva_hv,
+       .unmap_gfn_range = kvm_unmap_gfn_range_hv,
+       .age_gfn = kvm_age_gfn_hv,
+       .test_age_gfn = kvm_test_age_gfn_hv,
+       .set_spte_gfn = kvm_set_spte_gfn_hv,
        .free_memslot = kvmppc_core_free_memslot_hv,
        .init_vm =  kvmppc_core_init_vm_hv,
        .destroy_vm = kvmppc_core_destroy_vm_hv,
index 913944d..d7733b0 100644 (file)
@@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
 }
 
 /************* MMU Notifiers *************/
-static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
-                            unsigned long end)
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        long i;
        struct kvm_vcpu *vcpu;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
 
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+                                     range->end << PAGE_SHIFT);
 
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn, gfn+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
-                                             gfn_end << PAGE_SHIFT);
-       }
+       return false;
 }
 
-static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
-                                 unsigned long end)
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       do_kvm_unmap_hva(kvm, start, end);
-
-       return 0;
+       return do_kvm_unmap_gfn(kvm, range);
 }
 
-static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
-                         unsigned long end)
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* The page will get remapped properly on its next fault */
-       do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+       return do_kvm_unmap_gfn(kvm, range);
 }
 
 /*****************************************/
@@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = {
        .flush_memslot = kvmppc_core_flush_memslot_pr,
        .prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
        .commit_memory_region = kvmppc_core_commit_memory_region_pr,
-       .unmap_hva_range = kvm_unmap_hva_range_pr,
-       .age_hva  = kvm_age_hva_pr,
-       .test_age_hva = kvm_test_age_hva_pr,
-       .set_spte_hva = kvm_set_spte_hva_pr,
+       .unmap_gfn_range = kvm_unmap_gfn_range_pr,
+       .age_gfn  = kvm_age_gfn_pr,
+       .test_age_gfn = kvm_test_age_gfn_pr,
+       .set_spte_gfn = kvm_set_spte_gfn_pr,
        .free_memslot = kvmppc_core_free_memslot_pr,
        .init_vm = kvmppc_core_init_vm_pr,
        .destroy_vm = kvmppc_core_destroy_vm_pr,
index ed0c9c4..7f16afc 100644 (file)
@@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
 
 /************* MMU Notifiers *************/
 
-static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       trace_kvm_unmap_hva(hva);
-
        /*
         * Flush all shadow tlb entries everywhere. This is slow, but
         * we are 100% sure that we catch the to be unmapped page
         */
-       kvm_flush_remote_tlbs(kvm);
-
-       return 0;
+       return true;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       /* kvm_unmap_hva flushes everything anyways */
-       kvm_unmap_hva(kvm, start);
-
-       return 0;
+       return kvm_e500_mmu_unmap_gfn(kvm, range);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* The page will get remapped properly on its next fault */
-       kvm_unmap_hva(kvm, hva);
-       return 0;
+       return kvm_e500_mmu_unmap_gfn(kvm, range);
 }
 
 /*****************************************/
index 3837842..eff6e82 100644 (file)
@@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit,
                )
 );
 
-TRACE_EVENT(kvm_unmap_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
 TRACE_EVENT(kvm_booke206_stlb_write,
        TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
        TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
index 6bcfc56..8925f39 100644 (file)
@@ -454,6 +454,7 @@ struct kvm_vcpu_stat {
        u64 diagnose_44;
        u64 diagnose_9c;
        u64 diagnose_9c_ignored;
+       u64 diagnose_9c_forward;
        u64 diagnose_258;
        u64 diagnose_308;
        u64 diagnose_500;
@@ -700,6 +701,10 @@ struct kvm_hw_bp_info_arch {
 #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
                (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
 
+#define KVM_GUESTDBG_VALID_MASK \
+               (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+               KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
 struct kvm_guestdbg_info_arch {
        unsigned long cr0;
        unsigned long cr9;
index 01e3600..e317fd4 100644 (file)
@@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void);
 extern void __cpu_die(unsigned int cpu);
 extern int __cpu_disable(void);
 extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
 
 #endif /* __ASM_SMP_H */
index 58c8afa..2fec2b8 100644 (file)
@@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu)
        asm volatile("diag %0,0,0x9c"
                     : : "d" (pcpu_devices[cpu].address));
 }
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
 
 /*
  * Send cpus emergency shutdown signal. This gives the cpus the
index 5b8ec1c..02c146f 100644 (file)
@@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+       /* Reset the count on a new slice */
+       if (time_after(jiffies, cur_slice)) {
+               cur_slice = jiffies;
+               forward_cnt = diag9c_forwarding_hz / HZ;
+       }
+       return forward_cnt-- <= 0 ? 1 : 0;
+}
+
 static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
 {
        struct kvm_vcpu *tcpu;
@@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
        if (!tcpu)
                goto no_yield;
 
-       /* target already running */
-       if (READ_ONCE(tcpu->cpu) >= 0)
-               goto no_yield;
+       /* target guest VCPU already running */
+       if (READ_ONCE(tcpu->cpu) >= 0) {
+               if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+                       goto no_yield;
+
+               /* target host CPU already running */
+               if (!vcpu_is_preempted(tcpu->cpu))
+                       goto no_yield;
+               smp_yield_cpu(tcpu->cpu);
+               VCPU_EVENT(vcpu, 5,
+                          "diag time slice end directed to %d: yield forwarded",
+                          tid);
+               vcpu->stat.diagnose_9c_forward++;
+               return 0;
+       }
 
        if (kvm_vcpu_yield_to(tcpu) <= 0)
                goto no_yield;
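
The forwarding path added above is rate limited: diag9c_forwarding_overrun() refills a per-jiffy budget of diag9c_forwarding_hz / HZ forwards, so over a full second at most roughly diag9c_forwarding_hz yields reach the host. A standalone sketch of the same bookkeeping (HZ and the parameter value are illustrative, and a plain compare stands in for time_after()):

#include <stdbool.h>
#include <stdio.h>

#define HZ 250                                          /* illustrative tick rate */
static unsigned int diag9c_forwarding_hz = 1000;        /* illustrative parameter value */

static unsigned long cur_slice;
static int forward_cnt;

/* Mirrors diag9c_forwarding_overrun(): refill the budget once per jiffy. */
static bool overrun(unsigned long jiffies)
{
        if (jiffies > cur_slice) {
                cur_slice = jiffies;
                forward_cnt = diag9c_forwarding_hz / HZ;        /* 1000 / 250 = 4 per jiffy */
        }
        return forward_cnt-- <= 0;
}

int main(void)
{
        unsigned long j;
        int forwarded = 0;

        for (j = 1; j <= HZ; j++)               /* simulate one second of ticks */
                while (!overrun(j))
                        forwarded++;
        printf("forwarded %d yields in one second\n", forwarded);      /* prints 1000 */
        return 0;
}
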
index 6d6b570..b9f85b2 100644 (file)
@@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
  * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ *      successful (return value 0), or to the first invalid DAT entry in
+ *      case of exceptions (return value > 0)
  * @fake: pgt references contiguous guest memory block, not a pgtable
  */
 static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
                        rfte.val = ptr;
                        goto shadow_r2t;
                }
+               *pgt = ptr + vaddr.rfx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
                if (rc)
                        return rc;
@@ -1060,6 +1063,7 @@ shadow_r2t:
                        rste.val = ptr;
                        goto shadow_r3t;
                }
+               *pgt = ptr + vaddr.rsx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
                if (rc)
                        return rc;
@@ -1087,6 +1091,7 @@ shadow_r3t:
                        rtte.val = ptr;
                        goto shadow_sgt;
                }
+               *pgt = ptr + vaddr.rtx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
                if (rc)
                        return rc;
@@ -1123,6 +1128,7 @@ shadow_sgt:
                        ste.val = ptr;
                        goto shadow_pgt;
                }
+               *pgt = ptr + vaddr.sx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
                if (rc)
                        return rc;
@@ -1157,6 +1163,8 @@ shadow_pgt:
  * @vcpu: virtual cpu
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ *         the valid leaf, plus some flags
  *
  * Returns: - 0 if the shadow fault was successfully resolved
  *         - > 0 (pgm exception code) on exceptions while faulting
@@ -1165,11 +1173,11 @@ shadow_pgt:
  *         - -ENOMEM if out of memory
  */
 int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
-                         unsigned long saddr)
+                         unsigned long saddr, unsigned long *datptr)
 {
        union vaddress vaddr;
        union page_table_entry pte;
-       unsigned long pgt;
+       unsigned long pgt = 0;
        int dat_protection, fake;
        int rc;
 
@@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
                pte.val = pgt + vaddr.px * PAGE_SIZE;
                goto shadow_page;
        }
-       if (!rc)
-               rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+       switch (rc) {
+       case PGM_SEGMENT_TRANSLATION:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_FIRST_TRANS:
+               pgt |= PEI_NOT_PTE;
+               break;
+       case 0:
+               pgt += vaddr.px * 8;
+               rc = gmap_read_table(sg->parent, pgt, &pte.val);
+       }
+       if (datptr)
+               *datptr = pgt | dat_protection * PEI_DAT_PROT;
        if (!rc && pte.i)
                rc = PGM_PAGE_TRANSLATION;
        if (!rc && pte.z)
index f4c5175..7c72a5e 100644 (file)
 
 /**
  * kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
  * @gra - guest real address
  *
  * Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra by applying the given prefix.
  */
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
-                                                unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
 {
-       unsigned long prefix  = kvm_s390_get_prefix(vcpu);
-
        if (gra < 2 * PAGE_SIZE)
                gra += prefix;
        else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -36,6 +33,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
        return gra;
 }
 
+/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+                                                unsigned long gra)
+{
+       return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+                                                          unsigned long ga)
+{
+       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+               return ga;
+       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+               return ga & ((1UL << 31) - 1);
+       return ga & ((1UL << 24) - 1);
+}
+
 /**
  * kvm_s390_logical_to_effective - convert guest logical to effective address
  * @vcpu: guest virtual cpu
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
 static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
                                                          unsigned long ga)
 {
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
-       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
-               return ga;
-       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
-               return ga & ((1UL << 31) - 1);
-       return ga & ((1UL << 24) - 1);
+       return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
 }
 
 /*
@@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
 int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
-                         unsigned long saddr);
+                         unsigned long saddr, unsigned long *datptr);
 
 #endif /* __KVM_S390_GACCESS_H */
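
The new _kvm_s390_logical_to_effective() helper masks a guest logical address according to the PSW addressing mode. A quick standalone check of the masks used above (a 64-bit host is assumed; the values can be verified by hand):

/* Worked example of the masking in _kvm_s390_logical_to_effective(). */
#include <assert.h>

int main(void)
{
        unsigned long ga = 0xffffeeee12345678UL;

        assert((ga & ((1UL << 24) - 1)) == 0x345678UL);         /* 24-bit mode */
        assert((ga & ((1UL << 31) - 1)) == 0x12345678UL);       /* 31-bit mode */
        /* 64-bit mode returns ga unchanged. */
        return 0;
}
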
index 2f09e9d..1296fc1 100644 (file)
@@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("instruction_diag_44", diagnose_44),
        VCPU_STAT("instruction_diag_9c", diagnose_9c),
        VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+       VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
        VCPU_STAT("instruction_diag_258", diagnose_258),
        VCPU_STAT("instruction_diag_308", diagnose_308),
        VCPU_STAT("instruction_diag_500", diagnose_500),
@@ -185,6 +186,11 @@ static bool use_gisa  = true;
 module_param(use_gisa, bool, 0644);
 MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
 
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
 /*
  * For now we handle at most 16 double words as this is what the s390 base
  * kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -544,6 +550,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_DIAG318:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               r = KVM_GUESTDBG_VALID_MASK;
+               break;
        case KVM_CAP_S390_HPAGE_1M:
                r = 0;
                if (hpage && !kvm_is_ucontrol(kvm))
@@ -4307,16 +4316,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu)
        kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
        kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
        if (MACHINE_HAS_GS) {
+               preempt_disable();
                __ctl_set_bit(2, 4);
                if (vcpu->arch.gs_enabled)
                        save_gs_cb(current->thread.gs_cb);
-               preempt_disable();
                current->thread.gs_cb = vcpu->arch.host_gscb;
                restore_gs_cb(vcpu->arch.host_gscb);
-               preempt_enable();
                if (!vcpu->arch.host_gscb)
                        __ctl_clear_bit(2, 4);
                vcpu->arch.host_gscb = NULL;
+               preempt_enable();
        }
        /* SIE will save etoken directly into SDNX and therefore kvm_run */
 }
@@ -4542,7 +4551,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
                /*
                 * As we are starting a second VCPU, we have to disable
                 * the IBS facility on all VCPUs to remove potentially
-                * oustanding ENABLE requests.
+                * outstanding ENABLE requests.
                 */
                __disable_ibs_on_all_vcpus(vcpu->kvm);
        }
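
KVM_CAP_SET_GUEST_DEBUG2 now reports the KVM_GUESTDBG_* flags the kernel accepts. A minimal userspace sketch of querying it (assumes kernel headers that already define the new capability; error handling is kept to a minimum):

/* Userspace sketch: ask which guest-debug flags this kernel supports. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int mask;

        if (kvm < 0)
                return 1;
        mask = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SET_GUEST_DEBUG2);
        printf("valid KVM_GUESTDBG flags: 0x%x\n", mask);
        return 0;
}
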
index 79dcd64..9fad251 100644 (file)
@@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
  * @kvm: the KVM guest
  */
 void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Maximum number of forwarded diag9c yields per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
 #endif
index bd803e0..4002a24 100644 (file)
@@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                memcpy((void *)((u64)scb_o + 0xc0),
                       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
                break;
-       case ICPT_PARTEXEC:
-               /* MVPG only */
-               memcpy((void *)((u64)scb_o + 0xc0),
-                      (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
-               break;
        }
 
        if (scb_s->ihcpu != 0xffffU)
@@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        /* with mso/msl, the prefix lies at offset *mso* */
        prefix += scb_s->mso;
 
-       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
        if (!rc && (scb_s->ecb & ECB_TE))
                rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                          prefix + PAGE_SIZE);
+                                          prefix + PAGE_SIZE, NULL);
        /*
         * We don't have to mprotect, we will be called for all unshadows.
         * SIE will detect if protection applies and trigger a validity.
@@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                                    current->thread.gmap_addr, 1);
 
        rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                  current->thread.gmap_addr);
+                                  current->thread.gmap_addr, NULL);
        if (rc > 0) {
                rc = inject_fault(vcpu, rc,
                                  current->thread.gmap_addr,
@@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
 {
        if (vsie_page->fault_addr)
                kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                     vsie_page->fault_addr);
+                                     vsie_page->fault_addr, NULL);
        vsie_page->fault_addr = 0;
 }
 
@@ -983,6 +978,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        return 0;
 }
 
+/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+       /* no need to validate the parameter and/or perform error handling */
+       reg &= 0xf;
+       switch (reg) {
+       case 15:
+               return vsie_page->scb_s.gg15;
+       case 14:
+               return vsie_page->scb_s.gg14;
+       default:
+               return vcpu->run->s.regs.gprs[reg];
+       }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+       u64 *pei_block = &vsie_page->scb_o->mcic;
+       int edat, rc_dest, rc_src;
+       union ctlreg0 cr0;
+
+       cr0.val = vcpu->arch.sie_block->gcr[0];
+       edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+       mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+       prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+       dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+       dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+       src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+       src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+       rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+       rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+       /*
+        * Either everything went well, or something non-critical went wrong
+        * e.g. because of a race. In either case, simply retry.
+        */
+       if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+               retry_vsie_icpt(vsie_page);
+               return -EAGAIN;
+       }
+       /* Something more serious went wrong, propagate the error */
+       if (rc_dest < 0)
+               return rc_dest;
+       if (rc_src < 0)
+               return rc_src;
+
+       /* The only possible suppressing exception: just deliver it */
+       if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+               clear_vsie_icpt(vsie_page);
+               rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+               WARN_ON_ONCE(rc_dest);
+               return 1;
+       }
+
+       /*
+        * Forward the PEI intercept to the guest if it was a page fault, or
+        * also for segment and region table faults if EDAT applies.
+        */
+       if (edat) {
+               rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+               rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+       } else {
+               rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+               rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+       }
+       if (!rc_dest && !rc_src) {
+               pei_block[0] = pei_dest;
+               pei_block[1] = pei_src;
+               return 1;
+       }
+
+       retry_vsie_icpt(vsie_page);
+
+       /*
+        * The host has edat, and the guest does not, or it was an ASCE type
+        * exception. The host needs to inject the appropriate DAT interrupts
+        * into the guest.
+        */
+       if (rc_dest)
+               return inject_fault(vcpu, rc_dest, dest, 1);
+       return inject_fault(vcpu, rc_src, src, 0);
+}
+
 /*
  * Run the vsie on a shadow scb and a shadow gmap, without any further
  * sanity checks, handling SIE faults.
@@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                if ((scb_s->ipa & 0xf000) != 0xf000)
                        scb_s->ipa += 0x1000;
                break;
+       case ICPT_PARTEXEC:
+               if (scb_s->ipa == 0xb254)
+                       rc = vsie_handle_mvpg(vcpu, vsie_page);
+               break;
        }
        return rc;
 }
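
kvm_s390_shadow_fault() now hands back, via datptr, the address of the relevant DAT entry with PEI_DAT_PROT and PEI_NOT_PTE ORed into the low bits, and vsie_handle_mvpg() stores those values into the PEI fields of the intercepted MVPG. A hedged sketch of pulling the pieces apart again (helper names are illustrative; since DAT entries are 8 bytes, the flag bits sit below bit 3):

/* Sketch: split a PEI value returned through datptr (flags from gaccess.h). */
static inline unsigned long pei_entry_addr(unsigned long pei)
{
        return pei & ~(unsigned long)(PEI_DAT_PROT | PEI_NOT_PTE);
}

static inline bool pei_is_not_pte(unsigned long pei)
{
        return pei & PEI_NOT_PTE;       /* points at a region/segment entry */
}

static inline bool pei_is_dat_prot(unsigned long pei)
{
        return pei & PEI_DAT_PROT;      /* DAT protection was in effect */
}
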
index 2792879..35391e9 100644 (file)
@@ -1931,6 +1931,7 @@ config X86_SGX
        depends on CRYPTO_SHA256=y
        select SRCU
        select MMU_NOTIFIER
+       select NUMA_KEEP_MEMINFO if NUMA
        help
          Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
          that can be used by applications to set aside private regions of code
index cc96e26..dddc746 100644 (file)
 #define X86_FEATURE_FENCE_SWAPGS_KERNEL        (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
 #define X86_FEATURE_SPLIT_LOCK_DETECT  (11*32+ 6) /* #AC for split lock */
 #define X86_FEATURE_PER_THREAD_MBA     (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1               (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2               (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX_VNNI           (12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVIC               (15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD    (15*32+15) /* Virtual VMSAVE VMLOAD */
 #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL                (15*32+20) /* Virtual SPEC_CTRL */
 #define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
index 3768819..ad22d48 100644 (file)
@@ -221,12 +221,22 @@ enum x86_intercept_stage;
 #define DR7_FIXED_1    0x00000400
 #define DR7_VOLATILE   0xffff2bff
 
+#define KVM_GUESTDBG_VALID_MASK \
+       (KVM_GUESTDBG_ENABLE | \
+       KVM_GUESTDBG_SINGLESTEP | \
+       KVM_GUESTDBG_USE_HW_BP | \
+       KVM_GUESTDBG_USE_SW_BP | \
+       KVM_GUESTDBG_INJECT_BP | \
+       KVM_GUESTDBG_INJECT_DB)
+
+
 #define PFERR_PRESENT_BIT 0
 #define PFERR_WRITE_BIT 1
 #define PFERR_USER_BIT 2
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 
@@ -236,6 +246,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 
@@ -1054,6 +1065,9 @@ struct kvm_arch {
        u32 user_space_msr_mask;
        struct kvm_x86_msr_filter __rcu *msr_filter;
 
+       /* Guest can access the SGX PROVISIONKEY. */
+       bool sgx_provisioning_allowed;
+
        struct kvm_pmu_event_filter __rcu *pmu_event_filter;
        struct task_struct *nx_lpage_recovery_thread;
 
@@ -1068,25 +1082,36 @@ struct kvm_arch {
        bool tdp_mmu_enabled;
 
        /*
-        * List of struct kvmp_mmu_pages being used as roots.
+        * List of struct kvm_mmu_pages being used as roots.
         * All struct kvm_mmu_pages in the list should have
         * tdp_mmu_page set.
-        * All struct kvm_mmu_pages in the list should have a positive
-        * root_count except when a thread holds the MMU lock and is removing
-        * an entry from the list.
+        *
+        * For reads, this list is protected by:
+        *      the MMU lock in read mode + RCU or
+        *      the MMU lock in write mode
+        *
+        * For writes, this list is protected by:
+        *      the MMU lock in read mode + the tdp_mmu_pages_lock or
+        *      the MMU lock in write mode
+        *
+        * Roots will remain in the list until their tdp_mmu_root_count
+        * drops to zero, at which point the thread that decremented the
+        * count to zero should remove the root from the list and clean
+        * it up, freeing the root after an RCU grace period.
         */
        struct list_head tdp_mmu_roots;
 
        /*
         * List of struct kvmp_mmu_pages not being used as roots.
         * All struct kvm_mmu_pages in the list should have
-        * tdp_mmu_page set and a root_count of 0.
+        * tdp_mmu_page set and a tdp_mmu_root_count of 0.
         */
        struct list_head tdp_mmu_pages;
 
        /*
         * Protects accesses to the following fields when the MMU lock
         * is held in read mode:
+        *  - tdp_mmu_roots (above)
         *  - tdp_mmu_pages (above)
         *  - the link field of struct kvm_mmu_pages used by the TDP MMU
         *  - lpage_disallowed_mmu_pages
@@ -1143,6 +1168,9 @@ struct kvm_vcpu_stat {
        u64 req_event;
        u64 halt_poll_success_ns;
        u64 halt_poll_fail_ns;
+       u64 nested_run;
+       u64 directed_yield_attempted;
+       u64 directed_yield_successful;
 };
 
 struct x86_instruction_info;
@@ -1269,8 +1297,8 @@ struct kvm_x86_ops {
        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
-       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level);
+       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level);
 
        bool (*has_wbinvd_exit)(void);
 
@@ -1339,6 +1367,7 @@ struct kvm_x86_ops {
        int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
        int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
        int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+       int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
@@ -1357,6 +1386,7 @@ struct kvm_x86_ops {
 struct kvm_x86_nested_ops {
        int (*check_events)(struct kvm_vcpu *vcpu);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
                         unsigned user_data_size);
@@ -1428,9 +1458,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1440,8 +1467,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1538,6 +1563,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
 
 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -1566,14 +1596,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1614,9 +1644,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -1735,11 +1762,7 @@ asmlinkage void kvm_spurious_fault(void);
        _ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
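
The new PFERR_SGX bit corresponds to bit 15 of the page-fault error code, which the CPU sets for SGX access-control (EPCM) violations. A trivial sketch of testing it with the mask defined above (the helper name is illustrative, not part of the patch):

/* Sketch: classify a page-fault error code with the new SGX bit. */
static inline bool fault_is_sgx(u64 error_code)
{
        return !!(error_code & PFERR_SGX_MASK);         /* bit 15, defined above */
}
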
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
new file mode 100644 (file)
index 0000000..a16e2c9
--- /dev/null
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Intel Software Guard Extensions (SGX) support.
+ */
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+/*
+ * This file contains both data structures defined by the SGX architecture and
+ * Linux-defined software data structures and functions.  The two should not be
+ * mixed together, for better readability.  The architectural definitions come first.
+ */
+
+/* The SGX specific CPUID function. */
+#define SGX_CPUID              0x12
+/* EPC enumeration. */
+#define SGX_CPUID_EPC          2
+/* An invalid EPC section, i.e. the end marker. */
+#define SGX_CPUID_EPC_INVALID  0x0
+/* A valid EPC section. */
+#define SGX_CPUID_EPC_SECTION  0x1
+/* The bitmask for the EPC section type. */
+#define SGX_CPUID_EPC_MASK     GENMASK(3, 0)
+
+enum sgx_encls_function {
+       ECREATE = 0x00,
+       EADD    = 0x01,
+       EINIT   = 0x02,
+       EREMOVE = 0x03,
+       EDGBRD  = 0x04,
+       EDGBWR  = 0x05,
+       EEXTEND = 0x06,
+       ELDU    = 0x08,
+       EBLOCK  = 0x09,
+       EPA     = 0x0A,
+       EWB     = 0x0B,
+       ETRACK  = 0x0C,
+       EAUG    = 0x0D,
+       EMODPR  = 0x0E,
+       EMODT   = 0x0F,
+};
+
+/**
+ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
+ * %SGX_NOT_TRACKED:           Previous ETRACK's shootdown sequence has not
+ *                             been completed yet.
+ * %SGX_CHILD_PRESENT:         SECS has child pages present in the EPC.
+ * %SGX_INVALID_EINITTOKEN:    EINITTOKEN is invalid and enclave signer's
+ *                             public key does not match IA32_SGXLEPUBKEYHASH.
+ * %SGX_UNMASKED_EVENT:                An unmasked event, e.g. INTR, was received
+ */
+enum sgx_return_code {
+       SGX_NOT_TRACKED                 = 11,
+       SGX_CHILD_PRESENT               = 13,
+       SGX_INVALID_EINITTOKEN          = 16,
+       SGX_UNMASKED_EVENT              = 128,
+};
+
+/* The modulus size for 3072-bit RSA keys. */
+#define SGX_MODULUS_SIZE 384
+
+/**
+ * enum sgx_miscselect - additional information to an SSA frame
+ * %SGX_MISC_EXINFO:   Report #PF or #GP to the SSA frame.
+ *
+ * Save State Area (SSA) is a stack inside the enclave used to store processor
+ * state when an exception or interrupt occurs. This enum defines additional
+ * information stored to an SSA frame.
+ */
+enum sgx_miscselect {
+       SGX_MISC_EXINFO         = BIT(0),
+};
+
+#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
+
+#define SGX_SSA_GPRS_SIZE              184
+#define SGX_SSA_MISC_EXINFO_SIZE       16
+
+/**
+ * enum sgx_attribute - the attributes field in &struct sgx_secs
+ * %SGX_ATTR_INIT:             Enclave can be entered (is initialized).
+ * %SGX_ATTR_DEBUG:            Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * %SGX_ATTR_MODE64BIT:                Tell that this is a 64-bit enclave.
+ * %SGX_ATTR_PROVISIONKEY:      Allow the use of provisioning keys for remote
+ *                             attestation.
+ * %SGX_ATTR_KSS:              Allow the use of key separation and sharing (KSS).
+ * %SGX_ATTR_EINITTOKENKEY:    Allow the use of the token signing key used to
+ *                             sign cryptographic tokens that can be passed to
+ *                             EINIT as an authorization to run an enclave.
+ */
+enum sgx_attribute {
+       SGX_ATTR_INIT           = BIT(0),
+       SGX_ATTR_DEBUG          = BIT(1),
+       SGX_ATTR_MODE64BIT      = BIT(2),
+       SGX_ATTR_PROVISIONKEY   = BIT(4),
+       SGX_ATTR_EINITTOKENKEY  = BIT(5),
+       SGX_ATTR_KSS            = BIT(7),
+};
+
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
+
+/**
+ * struct sgx_secs - SGX Enclave Control Structure (SECS)
+ * @size:              size of the address space
+ * @base:              base address of the  address space
+ * @ssa_frame_size:    size of an SSA frame
+ * @miscselect:                additional information stored to an SSA frame
+ * @attributes:                attributes for enclave
+ * @xfrm:              XSave-Feature Request Mask (subset of XCR0)
+ * @mrenclave:         SHA256-hash of the enclave contents
+ * @mrsigner:          SHA256-hash of the public key used to sign the SIGSTRUCT
+ * @config_id:         a user-defined value that is used in key derivation
+ * @isv_prod_id:       a user-defined value that is used in key derivation
+ * @isv_svn:           a user-defined value that is used in key derivation
+ * @config_svn:                a user-defined value that is used in key derivation
+ *
+ * SGX Enclave Control Structure (SECS) is a special enclave page that is not
+ * visible in the address space. In fact, this structure defines the address
+ * range and other global attributes for the enclave and it is the first EPC
+ * page created for any enclave. It is moved from a temporary buffer to an EPC
+ * by means of the ENCLS[ECREATE] function.
+ */
+struct sgx_secs {
+       u64 size;
+       u64 base;
+       u32 ssa_frame_size;
+       u32 miscselect;
+       u8  reserved1[24];
+       u64 attributes;
+       u64 xfrm;
+       u32 mrenclave[8];
+       u8  reserved2[32];
+       u32 mrsigner[8];
+       u8  reserved3[32];
+       u32 config_id[16];
+       u16 isv_prod_id;
+       u16 isv_svn;
+       u16 config_svn;
+       u8  reserved4[3834];
+} __packed;
+
+/**
+ * enum sgx_tcs_flags - execution flags for TCS
+ * %SGX_TCS_DBGOPTIN:  If enabled allows single-stepping and breakpoints
+ *                     inside an enclave. It is cleared by EADD but can
+ *                     be set later with EDBGWR.
+ */
+enum sgx_tcs_flags {
+       SGX_TCS_DBGOPTIN        = 0x01,
+};
+
+#define SGX_TCS_RESERVED_MASK  GENMASK_ULL(63, 1)
+#define SGX_TCS_RESERVED_SIZE  4024
+
+/**
+ * struct sgx_tcs - Thread Control Structure (TCS)
+ * @state:             used to mark an entered TCS
+ * @flags:             execution flags (cleared by EADD)
+ * @ssa_offset:                SSA stack offset relative to the enclave base
+ * @ssa_index:         the current SSA frame index (cleared by EADD)
+ * @nr_ssa_frames:     the number of frames in the SSA stack
+ * @entry_offset:      entry point offset relative to the enclave base
+ * @exit_addr:         address outside the enclave to exit on an exception or
+ *                     interrupt
+ * @fs_offset:         offset relative to the enclave base to become FS
+ *                     segment inside the enclave
+ * @gs_offset:         offset relative to the enclave base to become GS
+ *                     segment inside the enclave
+ * @fs_limit:          size to become a new FS-limit (only 32-bit enclaves)
+ * @gs_limit:          size to become a new GS-limit (only 32-bit enclaves)
+ *
+ * Thread Control Structure (TCS) is an enclave page visible in its address
+ * space that defines an entry point inside the enclave. A thread enters inside
+ * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered
+ * by only one thread at a time.
+ */
+struct sgx_tcs {
+       u64 state;
+       u64 flags;
+       u64 ssa_offset;
+       u32 ssa_index;
+       u32 nr_ssa_frames;
+       u64 entry_offset;
+       u64 exit_addr;
+       u64 fs_offset;
+       u64 gs_offset;
+       u32 fs_limit;
+       u32 gs_limit;
+       u8  reserved[SGX_TCS_RESERVED_SIZE];
+} __packed;
+
+/**
+ * struct sgx_pageinfo - an enclave page descriptor
+ * @addr:      address of the enclave page
+ * @contents:  pointer to the page contents
+ * @metadata:  pointer either to a SECINFO or PCMD instance
+ * @secs:      address of the SECS page
+ */
+struct sgx_pageinfo {
+       u64 addr;
+       u64 contents;
+       u64 metadata;
+       u64 secs;
+} __packed __aligned(32);
+
+
+/**
+ * enum sgx_page_type - bits in the SECINFO flags defining the page type
+ * %SGX_PAGE_TYPE_SECS:        a SECS page
+ * %SGX_PAGE_TYPE_TCS: a TCS page
+ * %SGX_PAGE_TYPE_REG: a regular page
+ * %SGX_PAGE_TYPE_VA:  a VA page
+ * %SGX_PAGE_TYPE_TRIM:        a page in trimmed state
+ */
+enum sgx_page_type {
+       SGX_PAGE_TYPE_SECS,
+       SGX_PAGE_TYPE_TCS,
+       SGX_PAGE_TYPE_REG,
+       SGX_PAGE_TYPE_VA,
+       SGX_PAGE_TYPE_TRIM,
+};
+
+#define SGX_NR_PAGE_TYPES      5
+#define SGX_PAGE_TYPE_MASK     GENMASK(7, 0)
+
+/**
+ * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
+ * %SGX_SECINFO_R:     allow read
+ * %SGX_SECINFO_W:     allow write
+ * %SGX_SECINFO_X:     allow execution
+ * %SGX_SECINFO_SECS:  a SECS page
+ * %SGX_SECINFO_TCS:   a TCS page
+ * %SGX_SECINFO_REG:   a regular page
+ * %SGX_SECINFO_VA:    a VA page
+ * %SGX_SECINFO_TRIM:  a page in trimmed state
+ */
+enum sgx_secinfo_flags {
+       SGX_SECINFO_R                   = BIT(0),
+       SGX_SECINFO_W                   = BIT(1),
+       SGX_SECINFO_X                   = BIT(2),
+       SGX_SECINFO_SECS                = (SGX_PAGE_TYPE_SECS << 8),
+       SGX_SECINFO_TCS                 = (SGX_PAGE_TYPE_TCS << 8),
+       SGX_SECINFO_REG                 = (SGX_PAGE_TYPE_REG << 8),
+       SGX_SECINFO_VA                  = (SGX_PAGE_TYPE_VA << 8),
+       SGX_SECINFO_TRIM                = (SGX_PAGE_TYPE_TRIM << 8),
+};
+
+#define SGX_SECINFO_PERMISSION_MASK    GENMASK_ULL(2, 0)
+#define SGX_SECINFO_PAGE_TYPE_MASK     (SGX_PAGE_TYPE_MASK << 8)
+#define SGX_SECINFO_RESERVED_MASK      ~(SGX_SECINFO_PERMISSION_MASK | \
+                                         SGX_SECINFO_PAGE_TYPE_MASK)
+
+/**
+ * struct sgx_secinfo - describes attributes of an EPC page
+ * @flags:     permissions and type
+ *
+ * Used together with ENCLS leaves that add or modify an EPC page to an
+ * enclave to define page permissions and type.
+ */
+struct sgx_secinfo {
+       u64 flags;
+       u8  reserved[56];
+} __packed __aligned(64);
+
+#define SGX_PCMD_RESERVED_SIZE 40
+
+/**
+ * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
+ * @enclave_id:        enclave identifier
+ * @mac:       MAC over PCMD, page contents and isvsvn
+ *
+ * A PCMD is stored for every page swapped out to regular memory. When ELDU loads
+ * the page back, it recalculates the MAC using an isvsvn number stored in a
+ * VA page. Together these two structures bring integrity and rollback
+ * protection.
+ */
+struct sgx_pcmd {
+       struct sgx_secinfo secinfo;
+       u64 enclave_id;
+       u8  reserved[SGX_PCMD_RESERVED_SIZE];
+       u8  mac[16];
+} __packed __aligned(128);
+
+#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
+#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
+#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
+#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
+
+/**
+ * struct sgx_sigstruct_header -  defines author of the enclave
+ * @header1:           constant byte string
+ * @vendor:            must be either 0x0000 or 0x8086
+ * @date:              YYYYMMDD in BCD
+ * @header2:           constant byte string
+ * @swdefined:         software defined value
+ */
+struct sgx_sigstruct_header {
+       u64 header1[2];
+       u32 vendor;
+       u32 date;
+       u64 header2[2];
+       u32 swdefined;
+       u8  reserved1[84];
+} __packed;
+
+/**
+ * struct sgx_sigstruct_body - defines contents of the enclave
+ * @miscselect:                additional information stored to an SSA frame
+ * @misc_mask:         required miscselect in SECS
+ * @attributes:                attributes for enclave
+ * @xfrm:              XSave-Feature Request Mask (subset of XCR0)
+ * @attributes_mask:   required attributes in SECS
+ * @xfrm_mask:         required XFRM in SECS
+ * @mrenclave:         SHA256-hash of the enclave contents
+ * @isvprodid:         a user-defined value that is used in key derivation
+ * @isvsvn:            a user-defined value that is used in key derivation
+ */
+struct sgx_sigstruct_body {
+       u32 miscselect;
+       u32 misc_mask;
+       u8  reserved2[20];
+       u64 attributes;
+       u64 xfrm;
+       u64 attributes_mask;
+       u64 xfrm_mask;
+       u8  mrenclave[32];
+       u8  reserved3[32];
+       u16 isvprodid;
+       u16 isvsvn;
+} __packed;
+
+/**
+ * struct sgx_sigstruct - an enclave signature
+ * @header:            defines author of the enclave
+ * @modulus:           the modulus of the public key
+ * @exponent:          the exponent of the public key
+ * @signature:         the signature calculated over the fields except the modulus
+ * @body:              defines contents of the enclave
+ * @q1:                        a value used in RSA signature verification
+ * @q2:                        a value used in RSA signature verification
+ *
+ * Header and body are the parts that are actually signed. The remaining fields
+ * define the signature of the enclave.
+ */
+struct sgx_sigstruct {
+       struct sgx_sigstruct_header header;
+       u8  modulus[SGX_MODULUS_SIZE];
+       u32 exponent;
+       u8  signature[SGX_MODULUS_SIZE];
+       struct sgx_sigstruct_body body;
+       u8  reserved4[12];
+       u8  q1[SGX_MODULUS_SIZE];
+       u8  q2[SGX_MODULUS_SIZE];
+} __packed;
+
+#define SGX_LAUNCH_TOKEN_SIZE 304
+
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+                    int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+                  void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+                     unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
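As a quick aside on how the permission and page-type fields declared in this header combine, here is a minimal sketch; it is illustrative only and not part of the patch, and the helper name is made up:

/*
 * Sketch: build a SECINFO descriptor for a regular, read/write enclave
 * page from the definitions above.
 */
static inline void sgx_secinfo_init_reg_rw(struct sgx_secinfo *secinfo)
{
	memset(secinfo, 0, sizeof(*secinfo));

	/* Bits 0-2 carry the permissions, bits 8-15 the page type. */
	secinfo->flags = SGX_SECINFO_REG | SGX_SECINFO_R | SGX_SECINFO_W;

	/* Everything outside those two fields must stay zero. */
	WARN_ON(secinfo->flags & SGX_SECINFO_RESERVED_MASK);
}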
index 1c56194..772e60e 100644 (file)
@@ -269,7 +269,9 @@ struct vmcb_save_area {
         * SEV-ES guests when referenced through the GHCB or for
         * saving to the host save area.
         */
-       u8 reserved_7[80];
+       u8 reserved_7[72];
+       u32 spec_ctrl;          /* Guest version of SPEC_CTRL at 0x2E0 */
+       u8 reserved_7b[4];
        u32 pkru;
        u8 reserved_7a[20];
        u64 reserved_8;         /* rax already available at 0x01f8 */
index 358707f..0ffaa31 100644 (file)
@@ -373,6 +373,7 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_MOV_SS                0x00000002
 #define GUEST_INTR_STATE_SMI           0x00000004
 #define GUEST_INTR_STATE_NMI           0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR  0x00000010
 
 /* GUEST_ACTIVITY_STATE flags */
 #define GUEST_ACTIVITY_ACTIVE          0
index b8e650a..946d761 100644 (file)
@@ -27,6 +27,7 @@
 
 
 #define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE      0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
index 42af31b..defda61 100644 (file)
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
        { X86_FEATURE_AVX512_FP16,              X86_FEATURE_AVX512BW  },
        { X86_FEATURE_ENQCMD,                   X86_FEATURE_XSAVES    },
        { X86_FEATURE_PER_THREAD_MBA,           X86_FEATURE_MBA       },
+       { X86_FEATURE_SGX_LC,                   X86_FEATURE_SGX       },
+       { X86_FEATURE_SGX1,                     X86_FEATURE_SGX       },
+       { X86_FEATURE_SGX2,                     X86_FEATURE_SGX1      },
        {}
 };
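With these dependency entries in place, clearing the parent SGX feature is enough for the dependent bits to disappear as well, which is what lets the nosgx handling in feat_ctl.c below drop its SGX_LC-specific line. A hedged sketch of the effect (illustration only, not part of the patch):

	/* Clearing SGX through the cpuid-deps machinery... */
	setup_clear_cpu_cap(X86_FEATURE_SGX);
	/*
	 * ...transitively clears X86_FEATURE_SGX_LC, X86_FEATURE_SGX1 and
	 * X86_FEATURE_SGX2 as well, so callers no longer have to clear each
	 * dependent bit by hand.
	 */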
 
index 3b1b01f..da696eb 100644 (file)
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 }
 #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
 
-static void clear_sgx_caps(void)
-{
-       setup_clear_cpu_cap(X86_FEATURE_SGX);
-       setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
 static int __init nosgx(char *str)
 {
-       clear_sgx_caps();
+       setup_clear_cpu_cap(X86_FEATURE_SGX);
 
        return 0;
 }
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
 
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 {
+       bool enable_sgx_kvm = false, enable_sgx_driver = false;
        bool tboot = tboot_enabled();
-       bool enable_sgx;
+       bool enable_vmx;
        u64 msr;
 
        if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
                clear_cpu_cap(c, X86_FEATURE_VMX);
-               clear_sgx_caps();
+               clear_cpu_cap(c, X86_FEATURE_SGX);
                return;
        }
 
-       /*
-        * Enable SGX if and only if the kernel supports SGX and Launch Control
-        * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
-        */
-       enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
-                    cpu_has(c, X86_FEATURE_SGX_LC) &&
-                    IS_ENABLED(CONFIG_X86_SGX);
+       enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+                    IS_ENABLED(CONFIG_KVM_INTEL);
+
+       if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+               /*
+                * Separate out SGX driver enabling from KVM.  This allows KVM
+                * guests to use SGX even if the kernel SGX driver refuses to
+                * use it.  This happens if flexible Launch Control is not
+                * available.
+                */
+               enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+               enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+       }
 
        if (msr & FEAT_CTL_LOCKED)
                goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
         * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
         * for the kernel, e.g. using VMX to hide malicious code.
         */
-       if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+       if (enable_vmx) {
                msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
                if (tboot)
                        msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
        }
 
-       if (enable_sgx)
-               msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+       if (enable_sgx_kvm || enable_sgx_driver) {
+               msr |= FEAT_CTL_SGX_ENABLED;
+               if (enable_sgx_driver)
+                       msr |= FEAT_CTL_SGX_LC_ENABLED;
+       }
 
        wrmsrl(MSR_IA32_FEAT_CTL, msr);
 
@@ -173,10 +177,29 @@ update_caps:
        }
 
 update_sgx:
-       if (!(msr & FEAT_CTL_SGX_ENABLED) ||
-           !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
-               if (enable_sgx)
-                       pr_err_once("SGX disabled by BIOS\n");
-               clear_sgx_caps();
+       if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+               if (enable_sgx_kvm || enable_sgx_driver)
+                       pr_err_once("SGX disabled by BIOS.\n");
+               clear_cpu_cap(c, X86_FEATURE_SGX);
+               return;
+       }
+
+       /*
+        * VMX feature bit may be cleared due to being disabled in BIOS,
+        * in which case SGX virtualization cannot be supported either.
+        */
+       if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+               pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+               enable_sgx_kvm = 0;
+       }
+
+       if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+               if (!enable_sgx_kvm) {
+                       pr_err_once("SGX Launch Control is locked. Disabling SGX.\n");
+                       clear_cpu_cap(c, X86_FEATURE_SGX);
+               } else {
+                       pr_err_once("SGX Launch Control is locked. Supporting SGX virtualization only.\n");
+                       clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+               }
        }
 }
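The net effect of the reworked enabling logic is easiest to see from what other kernel code observes afterwards. A small sketch using the standard cpufeature helpers (the reporting function itself is hypothetical):

/* Illustration only: the three possible outcomes after init_ia32_feat_ctl(). */
static void sgx_report_enablement(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		pr_info("SGX off (unsupported, disabled by BIOS, or 'nosgx')\n");
	else if (cpu_feature_enabled(X86_FEATURE_SGX_LC))
		pr_info("SGX usable by the native driver and by KVM guests\n");
	else
		pr_info("SGX Launch Control locked: KVM guests only\n");
}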
index 972ec3b..21d1f06 100644 (file)
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
        { X86_FEATURE_CDP_L2,           CPUID_ECX,  2, 0x00000010, 2 },
        { X86_FEATURE_MBA,              CPUID_EBX,  3, 0x00000010, 0 },
        { X86_FEATURE_PER_THREAD_MBA,   CPUID_ECX,  0, 0x00000010, 3 },
+       { X86_FEATURE_SGX1,             CPUID_EAX,  0, 0x00000012, 0 },
+       { X86_FEATURE_SGX2,             CPUID_EAX,  1, 0x00000012, 0 },
        { X86_FEATURE_HW_PSTATE,        CPUID_EDX,  7, 0x80000007, 0 },
        { X86_FEATURE_CPB,              CPUID_EDX,  9, 0x80000007, 0 },
        { X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
index 91d3dc7..9c16567 100644 (file)
@@ -3,3 +3,4 @@ obj-y += \
        encl.o \
        ioctl.o \
        main.o
+obj-$(CONFIG_X86_SGX_KVM)      += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h
deleted file mode 100644 (file)
index dd7602c..0000000
+++ /dev/null
@@ -1,338 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/**
- * Copyright(c) 2016-20 Intel Corporation.
- *
- * Contains data structures defined by the SGX architecture.  Data structures
- * defined by the Linux software stack should not be placed here.
- */
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
-
-#include <linux/bits.h>
-#include <linux/types.h>
-
-/* The SGX specific CPUID function. */
-#define SGX_CPUID              0x12
-/* EPC enumeration. */
-#define SGX_CPUID_EPC          2
-/* An invalid EPC section, i.e. the end marker. */
-#define SGX_CPUID_EPC_INVALID  0x0
-/* A valid EPC section. */
-#define SGX_CPUID_EPC_SECTION  0x1
-/* The bitmask for the EPC section type. */
-#define SGX_CPUID_EPC_MASK     GENMASK(3, 0)
-
-/**
- * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
- * %SGX_NOT_TRACKED:           Previous ETRACK's shootdown sequence has not
- *                             been completed yet.
- * %SGX_INVALID_EINITTOKEN:    EINITTOKEN is invalid and enclave signer's
- *                             public key does not match IA32_SGXLEPUBKEYHASH.
- * %SGX_UNMASKED_EVENT:                An unmasked event, e.g. INTR, was received
- */
-enum sgx_return_code {
-       SGX_NOT_TRACKED                 = 11,
-       SGX_INVALID_EINITTOKEN          = 16,
-       SGX_UNMASKED_EVENT              = 128,
-};
-
-/* The modulus size for 3072-bit RSA keys. */
-#define SGX_MODULUS_SIZE 384
-
-/**
- * enum sgx_miscselect - additional information to an SSA frame
- * %SGX_MISC_EXINFO:   Report #PF or #GP to the SSA frame.
- *
- * Save State Area (SSA) is a stack inside the enclave used to store processor
- * state when an exception or interrupt occurs. This enum defines additional
- * information stored to an SSA frame.
- */
-enum sgx_miscselect {
-       SGX_MISC_EXINFO         = BIT(0),
-};
-
-#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
-
-#define SGX_SSA_GPRS_SIZE              184
-#define SGX_SSA_MISC_EXINFO_SIZE       16
-
-/**
- * enum sgx_attributes - the attributes field in &struct sgx_secs
- * %SGX_ATTR_INIT:             Enclave can be entered (is initialized).
- * %SGX_ATTR_DEBUG:            Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
- * %SGX_ATTR_MODE64BIT:                Tell that this a 64-bit enclave.
- * %SGX_ATTR_PROVISIONKEY:      Allow to use provisioning keys for remote
- *                             attestation.
- * %SGX_ATTR_KSS:              Allow to use key separation and sharing (KSS).
- * %SGX_ATTR_EINITTOKENKEY:    Allow to use token signing key that is used to
- *                             sign cryptographic tokens that can be passed to
- *                             EINIT as an authorization to run an enclave.
- */
-enum sgx_attribute {
-       SGX_ATTR_INIT           = BIT(0),
-       SGX_ATTR_DEBUG          = BIT(1),
-       SGX_ATTR_MODE64BIT      = BIT(2),
-       SGX_ATTR_PROVISIONKEY   = BIT(4),
-       SGX_ATTR_EINITTOKENKEY  = BIT(5),
-       SGX_ATTR_KSS            = BIT(7),
-};
-
-#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
-
-/**
- * struct sgx_secs - SGX Enclave Control Structure (SECS)
- * @size:              size of the address space
- * @base:              base address of the  address space
- * @ssa_frame_size:    size of an SSA frame
- * @miscselect:                additional information stored to an SSA frame
- * @attributes:                attributes for enclave
- * @xfrm:              XSave-Feature Request Mask (subset of XCR0)
- * @mrenclave:         SHA256-hash of the enclave contents
- * @mrsigner:          SHA256-hash of the public key used to sign the SIGSTRUCT
- * @config_id:         a user-defined value that is used in key derivation
- * @isv_prod_id:       a user-defined value that is used in key derivation
- * @isv_svn:           a user-defined value that is used in key derivation
- * @config_svn:                a user-defined value that is used in key derivation
- *
- * SGX Enclave Control Structure (SECS) is a special enclave page that is not
- * visible in the address space. In fact, this structure defines the address
- * range and other global attributes for the enclave and it is the first EPC
- * page created for any enclave. It is moved from a temporary buffer to an EPC
- * by the means of ENCLS[ECREATE] function.
- */
-struct sgx_secs {
-       u64 size;
-       u64 base;
-       u32 ssa_frame_size;
-       u32 miscselect;
-       u8  reserved1[24];
-       u64 attributes;
-       u64 xfrm;
-       u32 mrenclave[8];
-       u8  reserved2[32];
-       u32 mrsigner[8];
-       u8  reserved3[32];
-       u32 config_id[16];
-       u16 isv_prod_id;
-       u16 isv_svn;
-       u16 config_svn;
-       u8  reserved4[3834];
-} __packed;
-
-/**
- * enum sgx_tcs_flags - execution flags for TCS
- * %SGX_TCS_DBGOPTIN:  If enabled allows single-stepping and breakpoints
- *                     inside an enclave. It is cleared by EADD but can
- *                     be set later with EDBGWR.
- */
-enum sgx_tcs_flags {
-       SGX_TCS_DBGOPTIN        = 0x01,
-};
-
-#define SGX_TCS_RESERVED_MASK  GENMASK_ULL(63, 1)
-#define SGX_TCS_RESERVED_SIZE  4024
-
-/**
- * struct sgx_tcs - Thread Control Structure (TCS)
- * @state:             used to mark an entered TCS
- * @flags:             execution flags (cleared by EADD)
- * @ssa_offset:                SSA stack offset relative to the enclave base
- * @ssa_index:         the current SSA frame index (cleard by EADD)
- * @nr_ssa_frames:     the number of frame in the SSA stack
- * @entry_offset:      entry point offset relative to the enclave base
- * @exit_addr:         address outside the enclave to exit on an exception or
- *                     interrupt
- * @fs_offset:         offset relative to the enclave base to become FS
- *                     segment inside the enclave
- * @gs_offset:         offset relative to the enclave base to become GS
- *                     segment inside the enclave
- * @fs_limit:          size to become a new FS-limit (only 32-bit enclaves)
- * @gs_limit:          size to become a new GS-limit (only 32-bit enclaves)
- *
- * Thread Control Structure (TCS) is an enclave page visible in its address
- * space that defines an entry point inside the enclave. A thread enters inside
- * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered
- * by only one thread at a time.
- */
-struct sgx_tcs {
-       u64 state;
-       u64 flags;
-       u64 ssa_offset;
-       u32 ssa_index;
-       u32 nr_ssa_frames;
-       u64 entry_offset;
-       u64 exit_addr;
-       u64 fs_offset;
-       u64 gs_offset;
-       u32 fs_limit;
-       u32 gs_limit;
-       u8  reserved[SGX_TCS_RESERVED_SIZE];
-} __packed;
-
-/**
- * struct sgx_pageinfo - an enclave page descriptor
- * @addr:      address of the enclave page
- * @contents:  pointer to the page contents
- * @metadata:  pointer either to a SECINFO or PCMD instance
- * @secs:      address of the SECS page
- */
-struct sgx_pageinfo {
-       u64 addr;
-       u64 contents;
-       u64 metadata;
-       u64 secs;
-} __packed __aligned(32);
-
-
-/**
- * enum sgx_page_type - bits in the SECINFO flags defining the page type
- * %SGX_PAGE_TYPE_SECS:        a SECS page
- * %SGX_PAGE_TYPE_TCS: a TCS page
- * %SGX_PAGE_TYPE_REG: a regular page
- * %SGX_PAGE_TYPE_VA:  a VA page
- * %SGX_PAGE_TYPE_TRIM:        a page in trimmed state
- */
-enum sgx_page_type {
-       SGX_PAGE_TYPE_SECS,
-       SGX_PAGE_TYPE_TCS,
-       SGX_PAGE_TYPE_REG,
-       SGX_PAGE_TYPE_VA,
-       SGX_PAGE_TYPE_TRIM,
-};
-
-#define SGX_NR_PAGE_TYPES      5
-#define SGX_PAGE_TYPE_MASK     GENMASK(7, 0)
-
-/**
- * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
- * %SGX_SECINFO_R:     allow read
- * %SGX_SECINFO_W:     allow write
- * %SGX_SECINFO_X:     allow execution
- * %SGX_SECINFO_SECS:  a SECS page
- * %SGX_SECINFO_TCS:   a TCS page
- * %SGX_SECINFO_REG:   a regular page
- * %SGX_SECINFO_VA:    a VA page
- * %SGX_SECINFO_TRIM:  a page in trimmed state
- */
-enum sgx_secinfo_flags {
-       SGX_SECINFO_R                   = BIT(0),
-       SGX_SECINFO_W                   = BIT(1),
-       SGX_SECINFO_X                   = BIT(2),
-       SGX_SECINFO_SECS                = (SGX_PAGE_TYPE_SECS << 8),
-       SGX_SECINFO_TCS                 = (SGX_PAGE_TYPE_TCS << 8),
-       SGX_SECINFO_REG                 = (SGX_PAGE_TYPE_REG << 8),
-       SGX_SECINFO_VA                  = (SGX_PAGE_TYPE_VA << 8),
-       SGX_SECINFO_TRIM                = (SGX_PAGE_TYPE_TRIM << 8),
-};
-
-#define SGX_SECINFO_PERMISSION_MASK    GENMASK_ULL(2, 0)
-#define SGX_SECINFO_PAGE_TYPE_MASK     (SGX_PAGE_TYPE_MASK << 8)
-#define SGX_SECINFO_RESERVED_MASK      ~(SGX_SECINFO_PERMISSION_MASK | \
-                                         SGX_SECINFO_PAGE_TYPE_MASK)
-
-/**
- * struct sgx_secinfo - describes attributes of an EPC page
- * @flags:     permissions and type
- *
- * Used together with ENCLS leaves that add or modify an EPC page to an
- * enclave to define page permissions and type.
- */
-struct sgx_secinfo {
-       u64 flags;
-       u8  reserved[56];
-} __packed __aligned(64);
-
-#define SGX_PCMD_RESERVED_SIZE 40
-
-/**
- * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
- * @enclave_id:        enclave identifier
- * @mac:       MAC over PCMD, page contents and isvsvn
- *
- * PCMD is stored for every swapped page to the regular memory. When ELDU loads
- * the page back it recalculates the MAC by using a isvsvn number stored in a
- * VA page. Together these two structures bring integrity and rollback
- * protection.
- */
-struct sgx_pcmd {
-       struct sgx_secinfo secinfo;
-       u64 enclave_id;
-       u8  reserved[SGX_PCMD_RESERVED_SIZE];
-       u8  mac[16];
-} __packed __aligned(128);
-
-#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
-#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
-#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
-#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
-
-/**
- * struct sgx_sigstruct_header -  defines author of the enclave
- * @header1:           constant byte string
- * @vendor:            must be either 0x0000 or 0x8086
- * @date:              YYYYMMDD in BCD
- * @header2:           costant byte string
- * @swdefined:         software defined value
- */
-struct sgx_sigstruct_header {
-       u64 header1[2];
-       u32 vendor;
-       u32 date;
-       u64 header2[2];
-       u32 swdefined;
-       u8  reserved1[84];
-} __packed;
-
-/**
- * struct sgx_sigstruct_body - defines contents of the enclave
- * @miscselect:                additional information stored to an SSA frame
- * @misc_mask:         required miscselect in SECS
- * @attributes:                attributes for enclave
- * @xfrm:              XSave-Feature Request Mask (subset of XCR0)
- * @attributes_mask:   required attributes in SECS
- * @xfrm_mask:         required XFRM in SECS
- * @mrenclave:         SHA256-hash of the enclave contents
- * @isvprodid:         a user-defined value that is used in key derivation
- * @isvsvn:            a user-defined value that is used in key derivation
- */
-struct sgx_sigstruct_body {
-       u32 miscselect;
-       u32 misc_mask;
-       u8  reserved2[20];
-       u64 attributes;
-       u64 xfrm;
-       u64 attributes_mask;
-       u64 xfrm_mask;
-       u8  mrenclave[32];
-       u8  reserved3[32];
-       u16 isvprodid;
-       u16 isvsvn;
-} __packed;
-
-/**
- * struct sgx_sigstruct - an enclave signature
- * @header:            defines author of the enclave
- * @modulus:           the modulus of the public key
- * @exponent:          the exponent of the public key
- * @signature:         the signature calculated over the fields except modulus,
- * @body:              defines contents of the enclave
- * @q1:                        a value used in RSA signature verification
- * @q2:                        a value used in RSA signature verification
- *
- * Header and body are the parts that are actual signed. The remaining fields
- * define the signature of the enclave.
- */
-struct sgx_sigstruct {
-       struct sgx_sigstruct_header header;
-       u8  modulus[SGX_MODULUS_SIZE];
-       u32 exponent;
-       u8  signature[SGX_MODULUS_SIZE];
-       struct sgx_sigstruct_body body;
-       u8  reserved4[12];
-       u8  q1[SGX_MODULUS_SIZE];
-       u8  q2[SGX_MODULUS_SIZE];
-} __packed;
-
-#define SGX_LAUNCH_TOKEN_SIZE 304
-
-#endif /* _ASM_X86_SGX_ARCH_H */
index 8ce6d83..aa9b8b8 100644 (file)
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
        .get_unmapped_area      = sgx_get_unmapped_area,
 };
 
-const struct file_operations sgx_provision_fops = {
-       .owner                  = THIS_MODULE,
-};
-
 static struct miscdevice sgx_dev_enclave = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
        .fops = &sgx_encl_fops,
 };
 
-static struct miscdevice sgx_dev_provision = {
-       .minor = MISC_DYNAMIC_MINOR,
-       .name = "sgx_provision",
-       .nodename = "sgx_provision",
-       .fops = &sgx_provision_fops,
-};
-
 int __init sgx_drv_init(void)
 {
        unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
        if (ret)
                return ret;
 
-       ret = misc_register(&sgx_dev_provision);
-       if (ret) {
-               misc_deregister(&sgx_dev_enclave);
-               return ret;
-       }
-
        return 0;
 }
index 7449ef3..3be2032 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/suspend.h>
 #include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
 #include "encl.h"
 #include "encls.h"
 #include "sgx.h"
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
 
        ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
        if (ret) {
-               sgx_free_epc_page(epc_page);
+               sgx_encl_free_epc_page(epc_page);
                return ERR_PTR(ret);
        }
 
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
                        if (sgx_unmark_page_reclaimable(entry->epc_page))
                                continue;
 
-                       sgx_free_epc_page(entry->epc_page);
+                       sgx_encl_free_epc_page(entry->epc_page);
                        encl->secs_child_cnt--;
                        entry->epc_page = NULL;
                }
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
        xa_destroy(&encl->page_array);
 
        if (!encl->secs_child_cnt && encl->secs.epc_page) {
-               sgx_free_epc_page(encl->secs.epc_page);
+               sgx_encl_free_epc_page(encl->secs.epc_page);
                encl->secs.epc_page = NULL;
        }
 
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
                va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
                                           list);
                list_del(&va_page->list);
-               sgx_free_epc_page(va_page->epc_page);
+               sgx_encl_free_epc_page(va_page->epc_page);
                kfree(va_page);
        }
 
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
        ret = __epa(sgx_get_epc_virt_addr(epc_page));
        if (ret) {
                WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
-               sgx_free_epc_page(epc_page);
+               sgx_encl_free_epc_page(epc_page);
                return ERR_PTR(-EFAULT);
        }
 
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
 
        return slot == SGX_VA_SLOT_COUNT;
 }
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page:      EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It runs EREMOVE on the page and,
+ * only upon success, puts it back on the free page list. Otherwise it issues
+ * a WARNING to indicate that the page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+       int ret;
+
+       WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+       ret = __eremove(sgx_get_epc_virt_addr(page));
+       if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+               return;
+
+       sgx_free_epc_page(page);
+}
index d8d30cc..6e74f85 100644 (file)
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
 unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
 void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
 bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
 
 #endif /* _X86_ENCL_H */
index 443188f..9b20484 100644 (file)
 #include <asm/traps.h>
 #include "sgx.h"
 
-enum sgx_encls_function {
-       ECREATE = 0x00,
-       EADD    = 0x01,
-       EINIT   = 0x02,
-       EREMOVE = 0x03,
-       EDGBRD  = 0x04,
-       EDGBWR  = 0x05,
-       EEXTEND = 0x06,
-       ELDU    = 0x08,
-       EBLOCK  = 0x09,
-       EPA     = 0x0A,
-       EWB     = 0x0B,
-       ETRACK  = 0x0C,
-};
-
 /**
  * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
  *
@@ -55,6 +40,19 @@ enum sgx_encls_function {
        } while (0);                                                      \
 }
 
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret:       the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true:     ENCLS leaf faulted.
+ * - false:    Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+       return ret & ENCLS_FAULT_FLAG;
+}
+
 /**
  * encls_failed() - Check if an ENCLS function failed
  * @ret:       the return value of an ENCLS function call
@@ -65,7 +63,7 @@ enum sgx_encls_function {
  */
 static inline bool encls_failed(int ret)
 {
-       if (ret & ENCLS_FAULT_FLAG)
+       if (encls_faulted(ret))
                return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
 
        return !!ret;
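To make the split between the two predicates concrete, here is a caller-side fragment (the surrounding variables are hypothetical, mirroring how the EINIT path uses these helpers): encls_faulted() reports that the leaf faulted at all, while encls_failed() additionally treats a #PF as non-fatal because the fault handler is expected to resolve it.

	ret = __einit(sigstruct, token, addr);

	if (encls_faulted(ret)) {
		/* The leaf faulted; a trap number can be extracted. */
		if (encls_failed(ret))		/* faulted with something other than #PF */
			ENCLS_WARN(ret, "EINIT");
		trapnr = ENCLS_TRAPNR(ret);
	} else if (ret) {
		/* No fault, but the leaf returned an SGX error code. */
	}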
index 90a5caf..83df20e 100644 (file)
@@ -2,6 +2,7 @@
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
 #include <asm/mman.h>
+#include <asm/sgx.h>
 #include <linux/mman.h>
 #include <linux/delay.h>
 #include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
        encl->page_cnt--;
 
        if (va_page) {
-               sgx_free_epc_page(va_page->epc_page);
+               sgx_encl_free_epc_page(va_page->epc_page);
                list_del(&va_page->list);
                kfree(va_page);
        }
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
        return 0;
 
 err_out:
-       sgx_free_epc_page(encl->secs.epc_page);
+       sgx_encl_free_epc_page(encl->secs.epc_page);
        encl->secs.epc_page = NULL;
 
 err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
        mmap_read_unlock(current->mm);
 
 err_out_free:
-       sgx_free_epc_page(epc_page);
+       sgx_encl_free_epc_page(epc_page);
        kfree(encl_page);
 
        return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
                         void *token)
 {
        u64 mrsigner[4];
-       int i, j, k;
+       int i, j;
        void *addr;
        int ret;
 
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 
                        preempt_disable();
 
-                       for (k = 0; k < 4; k++)
-                               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+                       sgx_update_lepubkeyhash(mrsigner);
 
                        ret = __einit(sigstruct, token, addr);
 
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
                }
        }
 
-       if (ret & ENCLS_FAULT_FLAG) {
+       if (encls_faulted(ret)) {
                if (encls_failed(ret))
                        ENCLS_WARN(ret, "EINIT");
 
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
 {
        struct sgx_sigstruct *sigstruct;
        struct sgx_enclave_init init_arg;
-       struct page *initp_page;
        void *token;
        int ret;
 
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
        if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
                return -EFAULT;
 
-       initp_page = alloc_page(GFP_KERNEL);
-       if (!initp_page)
+       /*
+        * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+        * boundary.  kmalloc() will give this alignment when allocating
+        * PAGE_SIZE bytes.
+        */
+       sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!sigstruct)
                return -ENOMEM;
 
-       sigstruct = kmap(initp_page);
        token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
        memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
 
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
        ret = sgx_encl_init(encl, sigstruct, token);
 
 out:
-       kunmap(initp_page);
-       __free_page(initp_page);
+       kfree(sigstruct);
        return ret;
 }
 
@@ -665,24 +667,11 @@ out:
 static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
 {
        struct sgx_enclave_provision params;
-       struct file *file;
 
        if (copy_from_user(&params, arg, sizeof(params)))
                return -EFAULT;
 
-       file = fget(params.fd);
-       if (!file)
-               return -EINVAL;
-
-       if (file->f_op != &sgx_provision_fops) {
-               fput(file);
-               return -EINVAL;
-       }
-
-       encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
-       fput(file);
-       return 0;
+       return sgx_set_attribute(&encl->attributes_mask, params.fd);
 }
 
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
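From userspace, the provisioning handshake is unchanged by this refactor; a hedged sketch of a caller, assuming the mainline uapi names SGX_IOC_ENCLAVE_PROVISION and struct sgx_enclave_provision:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/sgx.h>

/* Grant SGX_ATTR_PROVISIONKEY to an enclave by proving access to
 * /dev/sgx_provision. */
static int allow_provision_key(int enclave_fd)
{
	struct sgx_enclave_provision params = { 0 };
	int provision_fd = open("/dev/sgx_provision", O_RDONLY);
	int ret;

	if (provision_fd < 0)
		return -1;

	params.fd = provision_fd;
	ret = ioctl(enclave_fd, SGX_IOC_ENCLAVE_PROVISION, &params);

	close(provision_fd);
	return ret;
}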
index 8df81a3..ad90474 100644 (file)
@@ -1,14 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
+#include <linux/file.h>
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/miscdevice.h>
 #include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include <asm/sgx.h>
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
@@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
  * with sgx_reclaimer_lock acquired.
  */
 static LIST_HEAD(sgx_active_page_list);
-
 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
 
+/* Number of EPC pages on the free lists; protected by the per-node free page list locks. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node.  Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
 /*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list and made available to the page allocator. SECS pages that
+ * still precede their child pages in the input list cannot be sanitized yet and
+ * are left on the list.
  */
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
 {
        struct sgx_epc_page *page;
        LIST_HEAD(dirty);
        int ret;
 
-       /* init_laundry_list is thread-local, no need for a lock: */
-       while (!list_empty(&section->init_laundry_list)) {
+       /* dirty_page_list is thread-local, no need for a lock: */
+       while (!list_empty(dirty_page_list)) {
                if (kthread_should_stop())
                        return;
 
-               /* needed for access to ->page_list: */
-               spin_lock(&section->lock);
-
-               page = list_first_entry(&section->init_laundry_list,
-                                       struct sgx_epc_page, list);
+               page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
 
                ret = __eremove(sgx_get_epc_virt_addr(page));
-               if (!ret)
-                       list_move(&page->list, &section->page_list);
-               else
+               if (!ret) {
+                       /*
+                        * page is now sanitized.  Make it available via the SGX
+                        * page allocator:
+                        */
+                       list_del(&page->list);
+                       sgx_free_epc_page(page);
+               } else {
+                       /* The page is not yet clean - move to the dirty list. */
                        list_move_tail(&page->list, &dirty);
-
-               spin_unlock(&section->lock);
+               }
 
                cond_resched();
        }
 
-       list_splice(&dirty, &section->init_laundry_list);
+       list_splice(&dirty, dirty_page_list);
 }
 
 static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 
                sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
 
-               sgx_free_epc_page(encl->secs.epc_page);
+               sgx_encl_free_epc_page(encl->secs.epc_page);
                encl->secs.epc_page = NULL;
 
                sgx_encl_put_backing(&secs_backing, true);
@@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void)
        struct sgx_epc_section *section;
        struct sgx_encl_page *encl_page;
        struct sgx_epc_page *epc_page;
+       struct sgx_numa_node *node;
        pgoff_t page_index;
        int cnt = 0;
        int ret;
@@ -379,50 +399,33 @@ skip:
                epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
 
                section = &sgx_epc_sections[epc_page->section];
-               spin_lock(&section->lock);
-               list_add_tail(&epc_page->list, &section->page_list);
-               section->free_cnt++;
-               spin_unlock(&section->lock);
-       }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
-       unsigned long cnt = 0;
-       int i;
-
-       for (i = 0; i < sgx_nr_epc_sections; i++)
-               cnt += sgx_epc_sections[i].free_cnt;
+               node = section->node;
 
-       return cnt;
+               spin_lock(&node->lock);
+               list_add_tail(&epc_page->list, &node->free_page_list);
+               sgx_nr_free_pages++;
+               spin_unlock(&node->lock);
+       }
 }
 
 static bool sgx_should_reclaim(unsigned long watermark)
 {
-       return sgx_nr_free_pages() < watermark &&
-              !list_empty(&sgx_active_page_list);
+       return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
 }
 
 static int ksgxd(void *p)
 {
-       int i;
-
        set_freezable();
 
        /*
         * Sanitize pages in order to recover from kexec(). The 2nd pass is
         * required for SECS pages, whose child pages blocked EREMOVE.
         */
-       for (i = 0; i < sgx_nr_epc_sections; i++)
-               sgx_sanitize_section(&sgx_epc_sections[i]);
-
-       for (i = 0; i < sgx_nr_epc_sections; i++) {
-               sgx_sanitize_section(&sgx_epc_sections[i]);
+       __sgx_sanitize_pages(&sgx_dirty_page_list);
+       __sgx_sanitize_pages(&sgx_dirty_page_list);
 
-               /* Should never happen. */
-               if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
-                       WARN(1, "EPC section %d has unsanitized pages.\n", i);
-       }
+       /* sanity check: */
+       WARN_ON(!list_empty(&sgx_dirty_page_list));
 
        while (!kthread_should_stop()) {
                if (try_to_freeze())
@@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void)
        return true;
 }
 
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 {
-       struct sgx_epc_page *page;
+       struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+       struct sgx_epc_page *page = NULL;
 
-       spin_lock(&section->lock);
+       spin_lock(&node->lock);
 
-       if (list_empty(&section->page_list)) {
-               spin_unlock(&section->lock);
+       if (list_empty(&node->free_page_list)) {
+               spin_unlock(&node->lock);
                return NULL;
        }
 
-       page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+       page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
        list_del_init(&page->list);
-       section->free_cnt--;
+       sgx_nr_free_pages--;
+
+       spin_unlock(&node->lock);
 
-       spin_unlock(&section->lock);
        return page;
 }
 
 /**
  * __sgx_alloc_epc_page() - Allocate an EPC page
  *
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through the NUMA nodes and reserve a free EPC page for the caller.
+ * Start from the NUMA node where the caller is executing.
  *
  * Return:
- *   an EPC page,
- *   -errno on error
+ * - an EPC page:      A free EPC page was borrowed for the caller.
+ * - NULL:             Out of EPC pages.
  */
 struct sgx_epc_page *__sgx_alloc_epc_page(void)
 {
-       struct sgx_epc_section *section;
        struct sgx_epc_page *page;
-       int i;
+       int nid_of_current = numa_node_id();
+       int nid = nid_of_current;
 
-       for (i = 0; i < sgx_nr_epc_sections; i++) {
-               section = &sgx_epc_sections[i];
+       if (node_isset(nid_of_current, sgx_numa_mask)) {
+               page = __sgx_alloc_epc_page_from_node(nid_of_current);
+               if (page)
+                       return page;
+       }
+
+       /* Fall back to the non-local NUMA nodes: */
+       while (true) {
+               nid = next_node_in(nid, sgx_numa_mask);
+               if (nid == nid_of_current)
+                       break;
 
-               page = __sgx_alloc_epc_page_from_section(section);
+               page = __sgx_alloc_epc_page_from_node(nid);
                if (page)
                        return page;
        }
@@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
  * sgx_free_epc_page() - Free an EPC page
  * @page:      an EPC page
  *
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back on the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in an uninitialized state. In
+ * other words, do EREMOVE, EWB or whatever operation is necessary before
+ * calling this function.
  */
 void sgx_free_epc_page(struct sgx_epc_page *page)
 {
        struct sgx_epc_section *section = &sgx_epc_sections[page->section];
-       int ret;
+       struct sgx_numa_node *node = section->node;
 
-       WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+       spin_lock(&node->lock);
 
-       ret = __eremove(sgx_get_epc_virt_addr(page));
-       if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
-               return;
+       list_add_tail(&page->list, &node->free_page_list);
+       sgx_nr_free_pages++;
 
-       spin_lock(&section->lock);
-       list_add_tail(&page->list, &section->page_list);
-       section->free_cnt++;
-       spin_unlock(&section->lock);
+       spin_unlock(&node->lock);
 }
 
 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
        }
 
        section->phys_addr = phys_addr;
-       spin_lock_init(&section->lock);
-       INIT_LIST_HEAD(&section->page_list);
-       INIT_LIST_HEAD(&section->init_laundry_list);
 
        for (i = 0; i < nr_pages; i++) {
                section->pages[i].section = index;
                section->pages[i].flags = 0;
                section->pages[i].owner = NULL;
-               list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+               list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
        }
 
-       section->free_cnt = nr_pages;
        return true;
 }
 
@@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void)
 {
        u32 eax, ebx, ecx, edx, type;
        u64 pa, size;
+       int nid;
        int i;
 
+       sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+       if (!sgx_numa_nodes)
+               return false;
+
        for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
                cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
 
@@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void)
                        break;
                }
 
+               nid = numa_map_to_online_node(phys_to_target_node(pa));
+               if (nid == NUMA_NO_NODE) {
+                       /* The physical address is already printed above. */
+                       pr_warn(FW_BUG "Unable to map EPC section to an online node. Falling back to NUMA node 0.\n");
+                       nid = 0;
+               }
+
+               if (!node_isset(nid, sgx_numa_mask)) {
+                       spin_lock_init(&sgx_numa_nodes[nid].lock);
+                       INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+                       node_set(nid, sgx_numa_mask);
+               }
+
+               sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+
                sgx_nr_epc_sections++;
        }
 
@@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void)
        return true;
 }
 
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
+ * The bare-metal driver has to set them to the hash of the enclave's signer
+ * before EINIT. KVM has to set them to the guest's virtual MSR values before
+ * running EINIT on behalf of the guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+       int i;
+
+       WARN_ON_ONCE(preemptible());
+
+       for (i = 0; i < 4; i++)
+               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+       .owner                  = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "sgx_provision",
+       .nodename = "sgx_provision",
+       .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes:                Pointer to allowed enclave attributes
+ * @attribute_fd:              File descriptor for specific attribute
+ *
+ * Append the enclave attribute indicated by the file descriptor to the allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY, indicated by
+ * /dev/sgx_provision, is supported.
+ *
+ * Return:
+ * -  0:       SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * - -EINVAL:  Invalid or unsupported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+                     unsigned int attribute_fd)
+{
+       struct file *file;
+
+       file = fget(attribute_fd);
+       if (!file)
+               return -EINVAL;
+
+       if (file->f_op != &sgx_provision_fops) {
+               fput(file);
+               return -EINVAL;
+       }
+
+       *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+       fput(file);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
 static int __init sgx_init(void)
 {
        int ret;
@@ -716,12 +806,28 @@ static int __init sgx_init(void)
                goto err_page_cache;
        }
 
-       ret = sgx_drv_init();
+       ret = misc_register(&sgx_dev_provision);
        if (ret)
                goto err_kthread;
 
+       /*
+        * Always try to initialize the native *and* KVM drivers.
+        * The KVM driver is less picky than the native one and
+        * can function if the native one is not supported on the
+        * current system or fails to initialize.
+        *
+        * Error out only if both fail to initialize.
+        */
+       ret = sgx_drv_init();
+
+       if (sgx_vepc_init() && ret)
+               goto err_provision;
+
        return 0;
 
+err_provision:
+       misc_deregister(&sgx_dev_provision);
+
 err_kthread:
        kthread_stop(ksgxd_tsk);
 
index 5fa42d1..4628ace 100644 (file)
@@ -8,11 +8,15 @@
 #include <linux/rwsem.h>
 #include <linux/types.h>
 #include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
 
 #undef pr_fmt
 #define pr_fmt(fmt) "sgx: " fmt
 
+#define EREMOVE_ERROR_MESSAGE \
+       "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+       "Refer to Documentation/x86/sgx.rst for more information."
+
 #define SGX_MAX_EPC_SECTIONS           8
 #define SGX_EEXTEND_BLOCK_SIZE         256
 #define SGX_NR_TO_SCAN                 16
@@ -29,29 +33,26 @@ struct sgx_epc_page {
        struct list_head list;
 };
 
+/*
+ * Tracking data for NUMA nodes that have EPC pages. Most importantly, the
+ * node-local free page list is stored here.
+ */
+struct sgx_numa_node {
+       struct list_head free_page_list;
+       spinlock_t lock;
+};
+
 /*
  * The firmware can define multiple chunks of EPC in different areas of
  * physical memory, e.g. for the memory areas of each node. This structure is
  * used to store the EPC pages of one EPC section and the virtual memory area
  * where the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
  */
 struct sgx_epc_section {
        unsigned long phys_addr;
        void *virt_addr;
        struct sgx_epc_page *pages;
-
-       spinlock_t lock;
-       struct list_head page_list;
-       unsigned long free_cnt;
-
-       /*
-        * Pages which need EREMOVE run on them before they can be
-        * used.  Only safe to be accessed in ksgxd and init code.
-        * Not protected by locks.
-        */
-       struct list_head init_laundry_list;
+       struct sgx_numa_node *node;
 };
 
 extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
 
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+       return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
 #endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644 (file)
index 0000000..6ad165a
--- /dev/null
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+       struct xarray page_array;
+       struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd because they still have
+ * children in other virtual EPC instances, and the lock that protects the list.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+                           struct vm_area_struct *vma, unsigned long addr)
+{
+       struct sgx_epc_page *epc_page;
+       unsigned long index, pfn;
+       int ret;
+
+       WARN_ON(!mutex_is_locked(&vepc->lock));
+
+       /* Calculate index of EPC page in virtual EPC's page_array */
+       index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+       epc_page = xa_load(&vepc->page_array, index);
+       if (epc_page)
+               return 0;
+
+       epc_page = sgx_alloc_epc_page(vepc, false);
+       if (IS_ERR(epc_page))
+               return PTR_ERR(epc_page);
+
+       ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+       if (ret)
+               goto err_free;
+
+       pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+       ret = vmf_insert_pfn(vma, addr, pfn);
+       if (ret != VM_FAULT_NOPAGE) {
+               ret = -EFAULT;
+               goto err_delete;
+       }
+
+       return 0;
+
+err_delete:
+       xa_erase(&vepc->page_array, index);
+err_free:
+       sgx_free_epc_page(epc_page);
+       return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct sgx_vepc *vepc = vma->vm_private_data;
+       int ret;
+
+       mutex_lock(&vepc->lock);
+       ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+       mutex_unlock(&vepc->lock);
+
+       if (!ret)
+               return VM_FAULT_NOPAGE;
+
+       if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+               mmap_read_unlock(vma->vm_mm);
+               return VM_FAULT_RETRY;
+       }
+
+       return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+       .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct sgx_vepc *vepc = file->private_data;
+
+       if (!(vma->vm_flags & VM_SHARED))
+               return -EINVAL;
+
+       vma->vm_ops = &sgx_vepc_vm_ops;
+       /* Don't copy VMA in fork() */
+       vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+       vma->vm_private_data = vepc;
+
+       return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+       int ret;
+
+       /*
+        * Take a previously guest-owned EPC page and return it to the
+        * general EPC page pool.
+        *
+        * Guests cannot be trusted to have left this page in a good
+        * state, so run EREMOVE on the page unconditionally.  In the
+        * case that a guest properly EREMOVE'd this page, a superfluous
+        * EREMOVE is harmless.
+        */
+       ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+       if (ret) {
+               /*
+                * Only SGX_CHILD_PRESENT is expected, which happens when
+                * EREMOVE'ing an SECS page that still has children. That case
+                * is handled by EREMOVE'ing the SECS again after all pages in
+                * the virtual EPC have been EREMOVE'd. See the comments below
+                * in sgx_vepc_release().
+                *
+                * The user of the virtual EPC (KVM) must guarantee that no
+                * logical processor is still running in the enclave in the
+                * guest, otherwise EREMOVE returns SGX_ENCLAVE_ACT, which
+                * cannot be handled here.
+                */
+               WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+                         ret, ret);
+               return ret;
+       }
+
+       sgx_free_epc_page(epc_page);
+
+       return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+       struct sgx_vepc *vepc = file->private_data;
+       struct sgx_epc_page *epc_page, *tmp, *entry;
+       unsigned long index;
+
+       LIST_HEAD(secs_pages);
+
+       xa_for_each(&vepc->page_array, index, entry) {
+               /*
+                * Remove all normal, child pages.  sgx_vepc_free_page()
+                * will fail if EREMOVE fails, but this is OK and expected on
+                * SECS pages.  Those can only be EREMOVE'd *after* all their
+                * child pages. Retries below will clean them up.
+                */
+               if (sgx_vepc_free_page(entry))
+                       continue;
+
+               xa_erase(&vepc->page_array, index);
+       }
+
+       /*
+        * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
+        * only had children in this 'epc' area.
+        */
+       xa_for_each(&vepc->page_array, index, entry) {
+               epc_page = entry;
+               /*
+                * An EREMOVE failure here means that the SECS page still
+                * has children.  But, since all children in this 'sgx_vepc'
+                * have been removed, the SECS page must have a child on
+                * another instance.
+                */
+               if (sgx_vepc_free_page(epc_page))
+                       list_add_tail(&epc_page->list, &secs_pages);
+
+               xa_erase(&vepc->page_array, index);
+       }
+
+       /*
+        * SECS pages are "pinned" by child pages, and "unpinned" once all
+        * children have been EREMOVE'd.  A child page in this instance
+        * may have pinned an SECS page encountered in an earlier release(),
+        * creating a zombie.  Since some children were EREMOVE'd above,
+        * try to EREMOVE all zombies in the hopes that one was unpinned.
+        */
+       mutex_lock(&zombie_secs_pages_lock);
+       list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+               /*
+                * Speculatively remove the page from the list of zombies,
+                * if the page is successfully EREMOVE'd it will be added to
+                * the list of free pages.  If EREMOVE fails, throw the page
+                * on the local list, which will be spliced on at the end.
+                */
+               list_del(&epc_page->list);
+
+               if (sgx_vepc_free_page(epc_page))
+                       list_add_tail(&epc_page->list, &secs_pages);
+       }
+
+       if (!list_empty(&secs_pages))
+               list_splice_tail(&secs_pages, &zombie_secs_pages);
+       mutex_unlock(&zombie_secs_pages_lock);
+
+       kfree(vepc);
+
+       return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+       struct sgx_vepc *vepc;
+
+       vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+       if (!vepc)
+               return -ENOMEM;
+       mutex_init(&vepc->lock);
+       xa_init(&vepc->page_array);
+
+       file->private_data = vepc;
+
+       return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = sgx_vepc_open,
+       .release        = sgx_vepc_release,
+       .mmap           = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+       .minor          = MISC_DYNAMIC_MINOR,
+       .name           = "sgx_vepc",
+       .nodename       = "sgx_vepc",
+       .fops           = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+       /* SGX virtualization requires KVM to work */
+       if (!cpu_feature_enabled(X86_FEATURE_VMX))
+               return -ENODEV;
+
+       INIT_LIST_HEAD(&zombie_secs_pages);
+       mutex_init(&zombie_secs_pages_lock);
+
+       return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo:  Pointer to PAGEINFO structure
+ * @secs:      Userspace pointer to SECS page
+ * @trapnr:    trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of the guest after KVM traps ECREATE, in order to
+ * enforce policies on the guest's enclaves, and return the trap number that
+ * should be injected into the guest in case of an ECREATE error.
+ *
+ * Return:
+ * -  0:       ECREATE was successful.
+ * - <0:       on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+                    int *trapnr)
+{
+       int ret;
+
+       /*
+        * @secs is an untrusted, userspace-provided address.  It comes from
+        * KVM and is assumed to be a valid pointer which points somewhere in
+        * userspace.  Accessing it can fault and invoke the SGX or other fault
+        * handlers when the userspace mapping for @secs doesn't exist.
+        *
+        * Add a WARN() to make sure @secs is already a valid userspace pointer
+        * from the caller (KVM), which should already have handled the invalid
+        * pointer case (for instance, one crafted by a malicious guest).  All
+        * other checks, such as the alignment of @secs, are deferred to ENCLS
+        * itself.
+        */
+       if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+               return -EINVAL;
+
+       __uaccess_begin();
+       ret = __ecreate(pageinfo, (void *)secs);
+       __uaccess_end();
+
+       if (encls_faulted(ret)) {
+               *trapnr = ENCLS_TRAPNR(ret);
+               return -EFAULT;
+       }
+
+       /* ECREATE doesn't return an error code, it faults or succeeds. */
+       WARN_ON_ONCE(ret);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
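
For orientation, the KVM-side caller (in vmx/sgx.c, not shown in this excerpt) is expected to use the return contract above roughly as in the hypothetical sketch below; skip_insn() and inject_trap() are placeholder names, not real KVM helpers.

    /* Hypothetical sketch of a KVM ENCLS[ECREATE]-exit handler. */
    static int handle_guest_ecreate(struct kvm_vcpu *vcpu,
                                    struct sgx_pageinfo *pageinfo,
                                    void __user *guest_secs)
    {
            int trapnr, ret;

            ret = sgx_virt_ecreate(pageinfo, guest_secs, &trapnr);
            if (!ret)
                    return skip_insn(vcpu);           /* ECREATE succeeded */
            if (ret == -EFAULT)
                    return inject_trap(vcpu, trapnr); /* reflect ENCLS fault */
            return ret;                               /* -EINVAL: bad pointer */
    }
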
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+                           void __user *secs)
+{
+       int ret;
+
+       /*
+        * Make sure all userspace pointers from caller (KVM) are valid.
+        * All other checks deferred to ENCLS itself.  Also see comment
+        * for @secs in sgx_virt_ecreate().
+        */
+#define SGX_EINITTOKEN_SIZE    304
+       if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+                        !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+                        !access_ok(secs, PAGE_SIZE)))
+               return -EINVAL;
+
+       __uaccess_begin();
+       ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+       __uaccess_end();
+
+       return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct:         Userspace pointer to SIGSTRUCT structure
+ * @token:             Userspace pointer to EINITTOKEN structure
+ * @secs:              Userspace pointer to SECS page
+ * @lepubkeyhash:      Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr:            trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of the guest after KVM traps EINIT.  If SGX_LC is
+ * available in the host, the SGX driver may rewrite the hardware values at
+ * will, so KVM must restore the guest's virtual MSR values to the hardware
+ * before EINIT to ensure it executes with the expected values.
+ *
+ * Return:
+ * -  0:       EINIT was successful.
+ * - <0:       on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+                  void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+       int ret;
+
+       if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+               ret = __sgx_virt_einit(sigstruct, token, secs);
+       } else {
+               preempt_disable();
+
+               sgx_update_lepubkeyhash(lepubkeyhash);
+
+               ret = __sgx_virt_einit(sigstruct, token, secs);
+               preempt_enable();
+       }
+
+       /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+       if (ret == -EINVAL)
+               return ret;
+
+       if (encls_faulted(ret)) {
+               *trapnr = ENCLS_TRAPNR(ret);
+               return -EFAULT;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
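
For context, sgx_update_lepubkeyhash() is defined in the SGX core rather than in this file; roughly, it writes the guest's four virtual hash values into the IA32_SGXLEPUBKEYHASH MSRs, which is why the call above is wrapped in preempt_disable(). A hedged, simplified sketch:

    /* Simplified sketch of the MSR update performed before EINIT. */
    static void load_guest_lepubkeyhash(const u64 *lepubkeyhash)
    {
            int i;

            for (i = 0; i < 4; i++)
                    wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
    }
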
index 78bb0fa..bd01a61 100644 (file)
@@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void)
        }
 }
 
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
 static bool pv_tlb_flush_supported(void)
 {
        return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
@@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void)
                kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
-
-#ifdef CONFIG_SMP
-
 static bool pv_ipi_supported(void)
 {
        return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
@@ -574,6 +574,49 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
        }
 }
 
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+                       const struct flush_tlb_info *info)
+{
+       u8 state;
+       int cpu;
+       struct kvm_steal_time *src;
+       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+       cpumask_copy(flushmask, cpumask);
+       /*
+        * We have to call flush only on online vCPUs. And
+        * queue flush_on_enter for pre-empted vCPUs
+        */
+       for_each_cpu(cpu, flushmask) {
+               src = &per_cpu(steal_time, cpu);
+               state = READ_ONCE(src->preempted);
+               if ((state & KVM_VCPU_PREEMPTED)) {
+                       if (try_cmpxchg(&src->preempted, &state,
+                                       state | KVM_VCPU_FLUSH_TLB))
+                               __cpumask_clear_cpu(cpu, flushmask);
+               }
+       }
+
+       native_flush_tlb_others(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+       int cpu;
+
+       if (!kvm_para_available() || nopv)
+               return 0;
+
+       if (pv_tlb_flush_supported() || pv_ipi_supported())
+               for_each_possible_cpu(cpu) {
+                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+                               GFP_KERNEL, cpu_to_node(cpu));
+               }
+
+       return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
        /*
@@ -611,33 +654,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
        local_irq_enable();
        return 0;
 }
-#endif
-
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
-                       const struct flush_tlb_info *info)
-{
-       u8 state;
-       int cpu;
-       struct kvm_steal_time *src;
-       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
-
-       cpumask_copy(flushmask, cpumask);
-       /*
-        * We have to call flush only on online vCPUs. And
-        * queue flush_on_enter for pre-empted vCPUs
-        */
-       for_each_cpu(cpu, flushmask) {
-               src = &per_cpu(steal_time, cpu);
-               state = READ_ONCE(src->preempted);
-               if ((state & KVM_VCPU_PREEMPTED)) {
-                       if (try_cmpxchg(&src->preempted, &state,
-                                       state | KVM_VCPU_FLUSH_TLB))
-                               __cpumask_clear_cpu(cpu, flushmask);
-               }
-       }
 
-       native_flush_tlb_others(flushmask, info);
-}
+#endif
 
 static void __init kvm_guest_init(void)
 {
@@ -653,12 +671,6 @@ static void __init kvm_guest_init(void)
                pv_ops.time.steal_clock = kvm_steal_clock;
        }
 
-       if (pv_tlb_flush_supported()) {
-               pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
-               pv_ops.mmu.tlb_remove_table = tlb_remove_table;
-               pr_info("KVM setup pv remote TLB flush\n");
-       }
-
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
@@ -668,6 +680,12 @@ static void __init kvm_guest_init(void)
        }
 
 #ifdef CONFIG_SMP
+       if (pv_tlb_flush_supported()) {
+               pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+               pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+               pr_info("KVM setup pv remote TLB flush\n");
+       }
+
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        if (pv_sched_yield_supported()) {
                smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
@@ -734,7 +752,7 @@ static uint32_t __init kvm_detect(void)
 
 static void __init kvm_apic_init(void)
 {
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
        if (pv_ipi_supported())
                kvm_setup_pv_ipi();
 #endif
@@ -794,32 +812,6 @@ static __init int activate_jump_labels(void)
 }
 arch_initcall(activate_jump_labels);
 
-static __init int kvm_alloc_cpumask(void)
-{
-       int cpu;
-       bool alloc = false;
-
-       if (!kvm_para_available() || nopv)
-               return 0;
-
-       if (pv_tlb_flush_supported())
-               alloc = true;
-
-#if defined(CONFIG_SMP)
-       if (pv_ipi_supported())
-               alloc = true;
-#endif
-
-       if (alloc)
-               for_each_possible_cpu(cpu) {
-                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
-                               GFP_KERNEL, cpu_to_node(cpu));
-               }
-
-       return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
index a788d51..f6b93a3 100644 (file)
@@ -84,6 +84,18 @@ config KVM_INTEL
          To compile this as a module, choose M here: the module
          will be called kvm-intel.
 
+config X86_SGX_KVM
+       bool "Software Guard eXtensions (SGX) Virtualization"
+       depends on X86_SGX && KVM_INTEL
+       help
+
+         Enables KVM guests to create SGX enclaves.
+
+         This includes support to expose "raw" unreclaimable enclave memory to
+         guests via a device node, e.g. /dev/sgx_vepc.
+
+         If unsure, say N.
+
 config KVM_AMD
        tristate "KVM for AMD processors support"
        depends on KVM
index eafc4d6..c589db5 100644 (file)
@@ -23,6 +23,8 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
 
 kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
                           vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM)        += vmx/sgx.o
+
 kvm-amd-y              += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)      += kvm.o
index 6bd2f8b..2ae0615 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -28,7 +29,7 @@
  * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
  * aligned to sizeof(unsigned long) because it's not accessed via bitops.
  */
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 }
 
 #define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
 
 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
        struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -170,6 +172,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                vcpu->arch.guest_supported_xcr0 =
                        (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
 
+       /*
+        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+        * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+        * '1' even on CPUs that don't support XSAVE.
+        */
+       best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+       if (best) {
+               best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+               best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+               best->ecx |= XFEATURE_MASK_FPSSE;
+       }
+
        kvm_update_pv_runtime(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -347,13 +364,13 @@ out:
        return r;
 }
 
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
 {
        const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
        struct kvm_cpuid_entry2 entry;
 
        reverse_cpuid_check(leaf);
-       kvm_cpu_caps[leaf] &= mask;
 
        cpuid_count(cpuid.function, cpuid.index,
                    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -361,6 +378,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
        kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
+static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+{
+       /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+       BUILD_BUG_ON(leaf < NCAPINTS);
+
+       kvm_cpu_caps[leaf] = mask;
+
+       __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+       /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+       BUILD_BUG_ON(leaf >= NCAPINTS);
+
+       kvm_cpu_caps[leaf] &= mask;
+
+       __kvm_cpu_cap_mask(leaf);
+}
+
 void kvm_set_cpu_caps(void)
 {
        unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -371,12 +408,13 @@ void kvm_set_cpu_caps(void)
        unsigned int f_gbpages = 0;
        unsigned int f_lm = 0;
 #endif
+       memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
 
-       BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+       BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
                     sizeof(boot_cpu_data.x86_capability));
 
        memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
-              sizeof(kvm_cpu_caps));
+              sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
 
        kvm_cpu_cap_mask(CPUID_1_ECX,
                /*
@@ -407,7 +445,7 @@ void kvm_set_cpu_caps(void)
        );
 
        kvm_cpu_cap_mask(CPUID_7_0_EBX,
-               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+               F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
                F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
                F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
                F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -418,7 +456,8 @@ void kvm_set_cpu_caps(void)
                F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
                F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
                F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+               F(SGX_LC)
        );
        /* Set LA57 based on hardware capability. */
        if (cpuid_ecx(7) & F(LA57))
@@ -457,6 +496,10 @@ void kvm_set_cpu_caps(void)
                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
        );
 
+       kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+               SF(SGX1) | SF(SGX2)
+       );
+
        kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
                F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
                F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -778,6 +821,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                        entry->edx = 0;
                }
                break;
+       case 0x12:
+               /* Intel SGX */
+               if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+                       entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                       break;
+               }
+
+               /*
+                * Index 0: Sub-features, MISCSELECT (a.k.a. extended features)
+                * and max enclave sizes.  The SGX sub-features and MISCSELECT
+                * are restricted by kernel and KVM capabilities (like most
+                * feature flags), while enclave size is unrestricted.
+                */
+               cpuid_entry_override(entry, CPUID_12_EAX);
+               entry->ebx &= SGX_MISC_EXINFO;
+
+               entry = do_host_cpuid(array, function, 1);
+               if (!entry)
+                       goto out;
+
+               /*
+                * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
+                * feature flags.  Advertise all supported flags, including
+                * privileged attributes that require explicit opt-in from
+                * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
+                * expected to derive it from supported XCR0.
+                */
+               entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+                             SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+                             SGX_ATTR_KSS;
+               entry->ebx &= 0;
+               break;
        /* Intel PT */
        case 0x14:
                if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
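
For clarity, the net effect of the new kvm_cpu_cap_init_scattered(CPUID_12_EAX, SF(SGX1) | SF(SGX2)) call is roughly the following simplified sketch (assumes the host kernel has both scattered bits set; otherwise SF() contributes 0):

    u32 eax, ebx, ecx, edx;

    /* Seed the KVM-only word with KVM's software policy... */
    kvm_cpu_caps[CPUID_12_EAX] = SF(SGX1) | SF(SGX2);

    /* ...then clamp it to what raw CPUID.0x12.0x0:EAX actually reports. */
    cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
    kvm_cpu_caps[CPUID_12_EAX] &= eax;
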
index 2a0c506..888e88b 100644 (file)
@@ -7,7 +7,25 @@
 #include <asm/processor.h>
 #include <uapi/asm/kvm_para.h>
 
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM.  Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+       CPUID_12_EAX     = NCAPINTS,
+       NR_KVM_CPU_CAPS,
+
+       NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f)          ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1           KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2           KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -80,6 +98,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
        [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
        [CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
        [CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
+       [CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
 };
 
 /*
@@ -100,6 +119,25 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
        BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
 }
 
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+       if (x86_feature == X86_FEATURE_SGX1)
+               return KVM_X86_FEATURE_SGX1;
+       else if (x86_feature == X86_FEATURE_SGX2)
+               return KVM_X86_FEATURE_SGX2;
+
+       return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+       return __feature_translate(x86_feature) / 32;
+}
+
 /*
  * Retrieve the bit mask from an X86_FEATURE_* definition.  Features contain
  * the hardware defined bit number (stored in bits 4:0) and a software defined
@@ -108,6 +146,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
  */
 static __always_inline u32 __feature_bit(int x86_feature)
 {
+       x86_feature = __feature_translate(x86_feature);
+
        reverse_cpuid_check(x86_feature / 32);
        return 1 << (x86_feature & 31);
 }
@@ -116,7 +156,7 @@ static __always_inline u32 __feature_bit(int x86_feature)
 
 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        return reverse_cpuid[x86_leaf];
@@ -248,6 +288,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
                is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
 }
 
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 0, 0);
+       return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
 static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -308,7 +356,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
 
 static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -316,7 +364,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 
 static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -324,7 +372,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 
 static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
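
A worked example (not part of the patch) of how the translation above plays out for a scattered SGX sub-feature:

    if (kvm_cpu_cap_has(X86_FEATURE_SGX1)) {
            /*
             * __feature_translate(X86_FEATURE_SGX1) == KVM_X86_FEATURE_SGX1
             *                                       == CPUID_12_EAX * 32 + 0,
             * so the check reads bit 0 of kvm_cpu_caps[CPUID_12_EAX], and
             * x86_feature_cpuid() resolves to CPUID function 0x12, index 0,
             * register EAX.
             */
    }
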
index cc369b9..0050f39 100644 (file)
@@ -2869,7 +2869,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
                return;
 
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        return;
                /*
index c68bfc3..88d0ed5 100644 (file)
@@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e)
        return ((2ULL << (e - s)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len);
 
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
        if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(root_hpa))
                return;
 
-       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
-                                vcpu->arch.mmu->shadow_root_level);
+       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+                                         vcpu->arch.mmu->shadow_root_level);
 }
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * write-protects guest page to sync the guest modification, b) another one is
  * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
  * between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
  * 2) the first case requires flushing tlb immediately avoiding corrupting
  *    shadow page table between all vcpus so it should be in the protection of
  *    mmu-lock. And the another case does not need to flush tlb until returning
@@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * So, there is the problem: the first case can meet the corrupted tlb caused
  * by another case which write-protects pages but without flush tlb
  * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush the tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set; this works since the other case never touches MMU-writable.
  *
  * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
  * readonly, if that happens, we need to flush tlb. Fortunately,
  * mmu_spte_update() has already handled it perfectly.
  *
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
  * - if we want to see if it has writable tlb entry or if the spte can be
- *   writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ *   writable on the mmu mapping, check MMU-writable; this is the most common
  *   case, otherwise
  * - if we fix page fault on the spte or do write-protection by dirty logging,
  *   check PT_WRITABLE_MASK.
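
As a reading aid for the rules above, a sketch (not code from this series; KVM's real helpers differ) of which bit answers which question:

    static bool spte_may_have_writable_tlb_entry(u64 spte)
    {
            /* "could any vCPU have a writable TLB entry for this spte?" */
            return spte & shadow_mmu_writable_mask;
    }

    static bool spte_is_writable(u64 spte)
    {
            /* "is the mapping writable right now?" (fault fix, dirty logging) */
            return spte & PT_WRITABLE_MASK;
    }
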
index 486aa94..930ac8a 100644 (file)
@@ -48,6 +48,7 @@
 #include <asm/memtype.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/set_memory.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
 #include "trace.h"
@@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void)
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
                           unsigned int access)
 {
-       u64 mask = make_mmio_spte(vcpu, gfn, access);
+       u64 spte = make_mmio_spte(vcpu, gfn, access);
 
-       trace_mark_mmio_spte(sptep, gfn, mask);
-       mmu_spte_set(sptep, mask);
+       trace_mark_mmio_spte(sptep, gfn, spte);
+       mmu_spte_set(sptep, spte);
 }
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte)
        return spte & shadow_mmio_access_mask;
 }
 
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
-                         kvm_pfn_t pfn, unsigned int access)
-{
-       if (unlikely(is_noslot_pfn(pfn))) {
-               mark_mmio_spte(vcpu, sptep, gfn, access);
-               return true;
-       }
-
-       return false;
-}
-
 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
        u64 kvm_gen, spte_gen, gen;
@@ -725,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
  * handling slots that are not large page aligned.
  */
 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
-                                             struct kvm_memory_slot *slot,
-                                             int level)
+               const struct kvm_memory_slot *slot, int level)
 {
        unsigned long idx;
 
@@ -1118,7 +1107,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
        rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        if (pt_protect)
-               spte &= ~SPTE_MMU_WRITEABLE;
+               spte &= ~shadow_mmu_writable_mask;
        spte = spte & ~PT_WRITABLE_MASK;
 
        return mmu_spte_update(sptep, spte);
@@ -1308,26 +1297,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        return flush;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                          unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                           pte_t unused)
 {
        return kvm_zap_rmapp(kvm, rmap_head, slot);
 }
 
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                            unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                             pte_t pte)
 {
        u64 *sptep;
        struct rmap_iterator iter;
        int need_flush = 0;
        u64 new_spte;
-       pte_t *ptep = (pte_t *)data;
        kvm_pfn_t new_pfn;
 
-       WARN_ON(pte_huge(*ptep));
-       new_pfn = pte_pfn(*ptep);
+       WARN_ON(pte_huge(pte));
+       new_pfn = pte_pfn(pte);
 
 restart:
        for_each_rmap_spte(rmap_head, &iter, sptep) {
@@ -1336,7 +1324,7 @@ restart:
 
                need_flush = 1;
 
-               if (pte_write(*ptep)) {
+               if (pte_write(pte)) {
                        pte_list_remove(rmap_head, sptep);
                        goto restart;
                } else {
@@ -1424,93 +1412,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
 
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
-                    unsigned long start,
-                    unsigned long end,
-                    unsigned long data,
-                    int (*handler)(struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head,
-                                   struct kvm_memory_slot *slot,
-                                   gfn_t gfn,
-                                   int level,
-                                   unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+                                                struct kvm_gfn_range *range,
+                                                rmap_handler_t handler)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        struct slot_rmap_walk_iterator iterator;
-       int ret = 0;
-       int i;
-
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-               slots = __kvm_memslots(kvm, i);
-               kvm_for_each_memslot(memslot, slots) {
-                       unsigned long hva_start, hva_end;
-                       gfn_t gfn_start, gfn_end;
+       bool ret = false;
 
-                       hva_start = max(start, memslot->userspace_addr);
-                       hva_end = min(end, memslot->userspace_addr +
-                                     (memslot->npages << PAGE_SHIFT));
-                       if (hva_start >= hva_end)
-                               continue;
-                       /*
-                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                        */
-                       gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-                       gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-                       for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
-                                                KVM_MAX_HUGEPAGE_LEVEL,
-                                                gfn_start, gfn_end - 1,
-                                                &iterator)
-                               ret |= handler(kvm, iterator.rmap, memslot,
-                                              iterator.gfn, iterator.level, data);
-               }
-       }
+       for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+                                range->start, range->end - 1, &iterator)
+               ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+                              iterator.level, range->pte);
 
        return ret;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         unsigned long data,
-                         int (*handler)(struct kvm *kvm,
-                                        struct kvm_rmap_head *rmap_head,
-                                        struct kvm_memory_slot *slot,
-                                        gfn_t gfn, int level,
-                                        unsigned long data))
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
-{
-       int r;
+       bool flush;
 
-       r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+       flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
        if (is_tdp_mmu_enabled(kvm))
-               r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+               flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
 
-       return r;
+       return flush;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int r;
+       bool flush;
 
-       r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+       flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
 
        if (is_tdp_mmu_enabled(kvm))
-               r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+               flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
 
-       return r;
+       return flush;
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                        struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                        unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                         pte_t unused)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1519,13 +1466,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        for_each_rmap_spte(rmap_head, &iter, sptep)
                young |= mmu_spte_age(sptep);
 
-       trace_kvm_age_page(gfn, level, slot, young);
        return young;
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                             struct kvm_memory_slot *slot, gfn_t gfn,
-                             int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level, pte_t unused)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1547,29 +1493,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
 
-       kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+       kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
        kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
                        KVM_PAGES_PER_HPAGE(sp->role.level));
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int young = false;
+       bool young;
+
+       young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
 
-       young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
        if (is_tdp_mmu_enabled(kvm))
-               young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+               young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
 
        return young;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int young = false;
+       bool young;
+
+       young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
 
-       young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
        if (is_tdp_mmu_enabled(kvm))
-               young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+               young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
 
        return young;
 }
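
The converted handlers take a struct kvm_gfn_range instead of raw HVAs; per the generic MMU-notifier code added elsewhere in this merge, it carries (at minimum) the already-resolved memslot and gfn window, which is why the per-memslot walk above disappears from the x86 code:

    /* Rough shape of the new argument (see include/linux/kvm_host.h). */
    struct kvm_gfn_range {
            struct kvm_memory_slot *slot;   /* memslot already resolved */
            gfn_t start;                    /* first gfn in the range */
            gfn_t end;                      /* exclusive end of the range */
            pte_t pte;                      /* new PTE, for kvm_set_spte_gfn() */
    };
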
@@ -2421,6 +2369,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 
        kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
 
+       /*
+        * Note, this check is intentionally soft, it only guarantees that one
+        * page is available, while the caller may end up allocating as many as
+        * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
+        * exceeding the (arbitrary by default) limit will not harm the host,
+        * being too aggressive may unnecessarily kill the guest, and getting an
+        * exact count is far more trouble than it's worth, especially in the
+        * page fault paths.
+        */
        if (!kvm_mmu_available_pages(vcpu->kvm))
                return -ENOSPC;
        return 0;
@@ -2561,9 +2518,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        struct kvm_mmu_page *sp;
        int ret;
 
-       if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
-               return 0;
-
        sp = sptep_to_sp(sptep);
 
        ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
@@ -2593,6 +2547,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
                 *sptep, write_fault, gfn);
 
+       if (unlikely(is_noslot_pfn(pfn))) {
+               mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+               return RET_PF_EMULATE;
+       }
+
        if (is_shadow_present_pte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2626,9 +2585,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
                                KVM_PAGES_PER_HPAGE(level));
 
-       if (unlikely(is_mmio_spte(*sptep)))
-               ret = RET_PF_EMULATE;
-
        /*
         * The fault is fully spurious if and only if the new SPTE and old SPTE
         * are identical, and emulation is not required.
@@ -2745,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-                                 struct kvm_memory_slot *slot)
+                                 const struct kvm_memory_slot *slot)
 {
        unsigned long hva;
        pte_t *pte;
@@ -2771,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
        return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-                             gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             kvm_pfn_t pfn, int max_level)
 {
        struct kvm_lpage_info *linfo;
 
@@ -2946,9 +2903,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                return true;
        }
 
-       if (unlikely(is_noslot_pfn(pfn)))
+       if (unlikely(is_noslot_pfn(pfn))) {
                vcpu_cache_mmio_info(vcpu, gva, gfn,
                                     access & shadow_mmio_access_mask);
+               /*
+                * If MMIO caching is disabled, emulate immediately without
+                * touching the shadow page tables as attempting to install an
+                * MMIO SPTE will just be an expensive nop.
+                */
+               if (unlikely(!shadow_mmio_value)) {
+                       *ret_val = RET_PF_EMULATE;
+                       return true;
+               }
+       }
 
        return false;
 }
@@ -3061,6 +3028,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                        if (!is_shadow_present_pte(spte))
                                break;
 
+               if (!is_shadow_present_pte(spte))
+                       break;
+
                sp = sptep_to_sp(iterator.sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;
@@ -3150,12 +3120,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 
        sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-       if (kvm_mmu_put_root(kvm, sp)) {
-               if (is_tdp_mmu_page(sp))
-                       kvm_tdp_mmu_free_root(kvm, sp);
-               else if (sp->role.invalid)
-                       kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
-       }
+       if (is_tdp_mmu_page(sp))
+               kvm_tdp_mmu_put_root(kvm, sp, false);
+       else if (!--sp->root_count && sp->role.invalid)
+               kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
        *root_hpa = INVALID_PAGE;
 }
@@ -3193,14 +3161,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
                    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
                        mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
-               } else {
-                       for (i = 0; i < 4; ++i)
-                               if (mmu->pae_root[i] != 0)
-                                       mmu_free_root_page(kvm,
-                                                          &mmu->pae_root[i],
-                                                          &invalid_list);
-                       mmu->root_hpa = INVALID_PAGE;
+               } else if (mmu->pae_root) {
+                       for (i = 0; i < 4; ++i) {
+                               if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+                                       continue;
+
+                               mmu_free_root_page(kvm, &mmu->pae_root[i],
+                                                  &invalid_list);
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
+                       }
                }
+               mmu->root_hpa = INVALID_PAGE;
                mmu->root_pgd = 0;
        }
 
@@ -3226,155 +3197,208 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 {
        struct kvm_mmu_page *sp;
 
-       write_lock(&vcpu->kvm->mmu_lock);
-
-       if (make_mmu_pages_available(vcpu)) {
-               write_unlock(&vcpu->kvm->mmu_lock);
-               return INVALID_PAGE;
-       }
        sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
        ++sp->root_count;
 
-       write_unlock(&vcpu->kvm->mmu_lock);
        return __pa(sp->spt);
 }
 
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
-       u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u8 shadow_root_level = mmu->shadow_root_level;
        hpa_t root;
        unsigned i;
+       int r;
+
+       write_lock(&vcpu->kvm->mmu_lock);
+       r = make_mmu_pages_available(vcpu);
+       if (r < 0)
+               goto out_unlock;
 
        if (is_tdp_mmu_enabled(vcpu->kvm)) {
                root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               mmu->root_hpa = root;
        } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
-               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
-                                     true);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+               mmu->root_hpa = root;
        } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+               if (WARN_ON_ONCE(!mmu->pae_root)) {
+                       r = -EIO;
+                       goto out_unlock;
+               }
+
                for (i = 0; i < 4; ++i) {
-                       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+                       WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
                        root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
                                              i << 30, PT32_ROOT_LEVEL, true);
-                       if (!VALID_PAGE(root))
-                               return -ENOSPC;
-                       vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+                       mmu->pae_root[i] = root | PT_PRESENT_MASK |
+                                          shadow_me_mask;
                }
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
-       } else
-               BUG();
+               mmu->root_hpa = __pa(mmu->pae_root);
+       } else {
+               WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+               r = -EIO;
+               goto out_unlock;
+       }
 
        /* root_pgd is ignored for direct MMUs. */
-       vcpu->arch.mmu->root_pgd = 0;
-
-       return 0;
+       mmu->root_pgd = 0;
+out_unlock:
+       write_unlock(&vcpu->kvm->mmu_lock);
+       return r;
 }
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-       u64 pdptr, pm_mask;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 pdptrs[4], pm_mask;
        gfn_t root_gfn, root_pgd;
        hpa_t root;
-       int i;
+       unsigned i;
+       int r;
 
-       root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+       root_pgd = mmu->get_guest_pgd(vcpu);
        root_gfn = root_pgd >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
 
+       /*
+        * On SVM, reading PDPTRs might access guest memory, which might fault
+        * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
+        */
+       if (mmu->root_level == PT32E_ROOT_LEVEL) {
+               for (i = 0; i < 4; ++i) {
+                       pdptrs[i] = mmu->get_pdptr(vcpu, i);
+                       if (!(pdptrs[i] & PT_PRESENT_MASK))
+                               continue;
+
+                       if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+                               return 1;
+               }
+       }
+
+       write_lock(&vcpu->kvm->mmu_lock);
+       r = make_mmu_pages_available(vcpu);
+       if (r < 0)
+               goto out_unlock;
+
        /*
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+       if (mmu->root_level >= PT64_ROOT_4LEVEL) {
                root = mmu_alloc_root(vcpu, root_gfn, 0,
-                                     vcpu->arch.mmu->shadow_root_level, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+                                     mmu->shadow_root_level, false);
+               mmu->root_hpa = root;
                goto set_root_pgd;
        }
 
+       if (WARN_ON_ONCE(!mmu->pae_root)) {
+               r = -EIO;
+               goto out_unlock;
+       }
+
        /*
         * We shadow a 32 bit page table. This may be a legacy 2-level
         * or a PAE 3-level page table. In either case we need to be aware that
         * the shadow page table may be a PAE or a long mode page table.
         */
-       pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+       pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
+               if (WARN_ON_ONCE(!mmu->lm_root)) {
+                       r = -EIO;
+                       goto out_unlock;
+               }
+
+               mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+       }
+
        for (i = 0; i < 4; ++i) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
-               if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
-                       if (!(pdptr & PT_PRESENT_MASK)) {
-                               vcpu->arch.mmu->pae_root[i] = 0;
+               WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+               if (mmu->root_level == PT32E_ROOT_LEVEL) {
+                       if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
                                continue;
                        }
-                       root_gfn = pdptr >> PAGE_SHIFT;
-                       if (mmu_check_root(vcpu, root_gfn))
-                               return 1;
+                       root_gfn = pdptrs[i] >> PAGE_SHIFT;
                }
 
                root = mmu_alloc_root(vcpu, root_gfn, i << 30,
                                      PT32_ROOT_LEVEL, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+               mmu->pae_root[i] = root | pm_mask;
        }
-       vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+               mmu->root_hpa = __pa(mmu->lm_root);
+       else
+               mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+       mmu->root_pgd = root_pgd;
+out_unlock:
+       write_unlock(&vcpu->kvm->mmu_lock);
+
+       return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 *lm_root, *pae_root;
 
        /*
-        * If we shadow a 32 bit page table with a long mode page
-        * table we enter this path.
+        * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+        * tables are allocated and initialized at root creation as there is no
+        * equivalent level in the guest's NPT to shadow.  Allocate the tables
+        * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
         */
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
-               if (vcpu->arch.mmu->lm_root == NULL) {
-                       /*
-                        * The additional page necessary for this is only
-                        * allocated on demand.
-                        */
+       if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+           mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+               return 0;
 
-                       u64 *lm_root;
+       /*
+        * This mess only works with 4-level paging and needs to be updated to
+        * work with 5-level paging.
+        */
+       if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+               return -EIO;
 
-                       lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-                       if (lm_root == NULL)
-                               return 1;
+       if (mmu->pae_root && mmu->lm_root)
+               return 0;
 
-                       lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+       /*
+        * The special roots should always be allocated in concert.  Yell and
+        * bail if KVM ends up in a state where only one of the roots is valid.
+        */
+       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+               return -EIO;
 
-                       vcpu->arch.mmu->lm_root = lm_root;
-               }
+       /*
+        * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+        * doesn't need to be decrypted.
+        */
+       pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!pae_root)
+               return -ENOMEM;
 
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+       lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!lm_root) {
+               free_page((unsigned long)pae_root);
+               return -ENOMEM;
        }
 
-set_root_pgd:
-       vcpu->arch.mmu->root_pgd = root_pgd;
+       mmu->pae_root = pae_root;
+       mmu->lm_root = lm_root;
 
        return 0;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.mmu->direct_map)
-               return mmu_alloc_direct_roots(vcpu);
-       else
-               return mmu_alloc_shadow_roots(vcpu);
-}
-
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
        int i;
@@ -3422,7 +3446,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        mmu_sync_children(vcpu, sp);
@@ -3554,11 +3578,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                            __is_rsvd_bits_set(rsvd_check, sptes[level], level);
 
        if (reserved) {
-               pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+               pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
                       __func__, addr);
                for (level = root; level >= leaf; level--)
-                       pr_err("------ spte 0x%llx level %d.\n",
-                              sptes[level], level);
+                       pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+                              sptes[level], level,
+                              rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
        }
 
        return reserved;
@@ -3653,6 +3678,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
 
+       /*
+        * Retry the page fault if the gfn hit a memslot that is being deleted
+        * or moved.  This ensures any existing SPTEs for the old memslot will
+        * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+        */
+       if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+               return true;
+
        /* Don't expose private memslots to L2. */
        if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
                *pfn = KVM_PFN_NOSLOT;
@@ -4615,12 +4648,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
        struct kvm_mmu *context = &vcpu->arch.guest_mmu;
        union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
 
-       context->shadow_root_level = new_role.base.level;
-
        __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
 
-       if (new_role.as_u64 != context->mmu_role.as_u64)
+       if (new_role.as_u64 != context->mmu_role.as_u64) {
                shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+               /*
+                * Override the level set by the common init helper, nested TDP
+                * always uses the host's TDP configuration.
+                */
+               context->shadow_root_level = new_role.base.level;
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
@@ -4802,16 +4840,23 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
        if (r)
                goto out;
-       r = mmu_alloc_roots(vcpu);
-       kvm_mmu_sync_roots(vcpu);
+       r = mmu_alloc_special_roots(vcpu);
+       if (r)
+               goto out;
+       if (vcpu->arch.mmu->direct_map)
+               r = mmu_alloc_direct_roots(vcpu);
+       else
+               r = mmu_alloc_shadow_roots(vcpu);
        if (r)
                goto out;
+
+       kvm_mmu_sync_roots(vcpu);
+
        kvm_mmu_load_pgd(vcpu);
        static_call(kvm_x86_tlb_flush_current)(vcpu);
 out:
        return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
@@ -4820,7 +4865,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
        WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static bool need_remote_flush(u64 old, u64 new)
 {
@@ -5169,10 +5213,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        slot_level_handler fn, int start_level, int end_level,
-                       gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+                       gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+                       bool flush)
 {
        struct slot_rmap_walk_iterator iterator;
-       bool flush = false;
 
        for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
                        end_gfn, &iterator) {
@@ -5180,7 +5224,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        flush |= fn(kvm, iterator.rmap, memslot);
 
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-                       if (flush && lock_flush_tlb) {
+                       if (flush && flush_on_yield) {
                                kvm_flush_remote_tlbs_with_address(kvm,
                                                start_gfn,
                                                iterator.gfn - start_gfn + 1);
@@ -5190,36 +5234,32 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                }
        }
 
-       if (flush && lock_flush_tlb) {
-               kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
-                                                  end_gfn - start_gfn + 1);
-               flush = false;
-       }
-
        return flush;
 }
 
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                  slot_level_handler fn, int start_level, int end_level,
-                 bool lock_flush_tlb)
+                 bool flush_on_yield)
 {
        return slot_handle_level_range(kvm, memslot, fn, start_level,
                        end_level, memslot->base_gfn,
                        memslot->base_gfn + memslot->npages - 1,
-                       lock_flush_tlb);
+                       flush_on_yield, false);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                slot_level_handler fn, bool lock_flush_tlb)
+                slot_level_handler fn, bool flush_on_yield)
 {
        return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
-                                PG_LEVEL_4K, lock_flush_tlb);
+                                PG_LEVEL_4K, flush_on_yield);
 }
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
 {
+       if (!tdp_enabled && mmu->pae_root)
+               set_memory_encrypted((unsigned long)mmu->pae_root, 1);
        free_page((unsigned long)mmu->pae_root);
        free_page((unsigned long)mmu->lm_root);
 }
@@ -5240,9 +5280,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
         * while the PDP table is a per-vCPU construct that's allocated at MMU
         * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
         * x86_64.  Therefore we need to allocate the PDP table in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.  Except for
-        * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
-        * skip allocating the PDP table.
+        * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
+        * generally doesn't use PAE paging and can skip allocating the PDP
+        * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
+        * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+        * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
         */
        if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
                return 0;
@@ -5252,8 +5294,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
                return -ENOMEM;
 
        mmu->pae_root = page_address(page);
+
+       /*
+        * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+        * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
+        * that KVM's writes and the CPU's reads get along.  Note, this is
+        * only necessary when using shadow paging, as 64-bit NPT can get at
+        * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+        * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+        */
+       if (!tdp_enabled)
+               set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+       else
+               WARN_ON_ONCE(shadow_me_mask);
+
        for (i = 0; i < 4; ++i)
-               mmu->pae_root[i] = INVALID_PAGE;
+               mmu->pae_root[i] = INVALID_PAE_ROOT;
 
        return 0;
 }
@@ -5365,6 +5421,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
         */
        kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
+       /*
+        * In order to ensure all threads see this change when handling the
+        * MMU reload signal, this must happen in the same critical section
+        * as kvm_reload_remote_mmus, and before kvm_zap_obsolete_pages,
+        * which could drop the MMU lock and yield.
+        */
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_invalidate_all_roots(kvm);
+
        /*
         * Notify all vcpus to reload its shadow page table and flush TLB.
         * Then all vcpus will switch to new shadow page table with the new
@@ -5377,10 +5442,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
        kvm_zap_obsolete_pages(kvm);
 
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_all(kvm);
-
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               kvm_tdp_mmu_zap_invalidated_roots(kvm);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5420,7 +5488,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int i;
-       bool flush;
+       bool flush = false;
 
        write_lock(&kvm->mmu_lock);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -5433,20 +5501,31 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                        if (start >= end)
                                continue;
 
-                       slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
-                                               PG_LEVEL_4K,
-                                               KVM_MAX_HUGEPAGE_LEVEL,
-                                               start, end - 1, true);
+                       flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+                                                       PG_LEVEL_4K,
+                                                       KVM_MAX_HUGEPAGE_LEVEL,
+                                                       start, end - 1, true, flush);
                }
        }
 
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+       write_unlock(&kvm->mmu_lock);
+
        if (is_tdp_mmu_enabled(kvm)) {
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+                                                         gfn_end, flush, true);
                if (flush)
-                       kvm_flush_remote_tlbs(kvm);
-       }
+                       kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+                                                          gfn_end);
 
-       write_unlock(&kvm->mmu_lock);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5465,10 +5544,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
        write_lock(&kvm->mmu_lock);
        flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
                                start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
        write_unlock(&kvm->mmu_lock);
 
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+               read_unlock(&kvm->mmu_lock);
+       }
+
        /*
         * We can flush all the TLBs out of the mmu lock without TLB
         * corruption since we just change the spte from writable to
@@ -5476,9 +5559,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
         * spte from present to present (changing the spte from present
         * to nonpresent will flush all the TLBs immediately), in other
         * words, the only case we care is mmu_spte_update() where we
-        * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
-        * instead of PT_WRITABLE_MASK, that means it does not depend
-        * on PT_WRITABLE_MASK anymore.
+        * have checked Host-writable | MMU-writable instead of
+        * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+        * anymore.
         */
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5529,21 +5612,32 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 {
        /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
        struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+       bool flush;
 
        write_lock(&kvm->mmu_lock);
-       slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+       flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+       if (flush)
+               kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+               if (flush)
+                       kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
        /*
         * All current use cases for flushing the TLBs for a specific memslot
-        * are related to dirty logging, and do the TLB flush out of mmu_lock.
+        * are related to dirty logging, and many do the TLB flush out of mmu_lock.
         * The interaction between the various operations on memslot must be
         * serialized by slots_locks to ensure the TLB flush from one operation
         * is observed by any other operation on the same memslot.
@@ -5560,10 +5654,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 
        write_lock(&kvm->mmu_lock);
        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
        write_unlock(&kvm->mmu_lock);
 
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+               read_unlock(&kvm->mmu_lock);
+       }
+
        /*
         * It's also safe to flush TLBs out of mmu lock here as currently this
         * function is only used for dirty logging, in which case flushing TLB
@@ -5701,25 +5799,6 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
-static void kvm_set_mmio_spte_mask(void)
-{
-       u64 mask;
-
-       /*
-        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
-        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
-        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
-        * 52-bit physical addresses then there are no reserved PA bits in the
-        * PTEs and so the reserved PA approach must be disabled.
-        */
-       if (shadow_phys_bits < 52)
-               mask = BIT_ULL(51) | PT_PRESENT_MASK;
-       else
-               mask = 0;
-
-       kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
 static bool get_nx_auto_mode(void)
 {
        /* Return true when CPU has the bug, and mitigations are ON */
@@ -5785,8 +5864,6 @@ int kvm_mmu_module_init(void)
 
        kvm_mmu_reset_all_pte_masks();
 
-       kvm_set_mmio_spte_mask();
-
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
                                            0, SLAB_ACCOUNT, NULL);
index ced15fd..cedc17b 100644 (file)
@@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        __mmu_spte_walk(vcpu, sp, fn, 2);
index 1f6f98c..f2546d6 100644 (file)
@@ -20,6 +20,16 @@ extern bool dbg;
 #define MMU_WARN_ON(x) do { } while (0)
 #endif
 
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as a PRESENT PDPTR with reserved bits set.  Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT       0
+#define IS_VALID_PAE_ROOT(x)   (!!(x))
+
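
A minimal, standalone sketch of the convention above (userspace C; the pae_root contents are illustrative, not taken from a real VM). It shows why '0' is a safe invalid marker, whereas KVM's all-ones INVALID_PAGE would itself look PRESENT to the CPU:

#include <assert.h>
#include <stdint.h>

#define INVALID_PAE_ROOT	0
#define IS_VALID_PAE_ROOT(x)	(!!(x))

int main(void)
{
	/* Hypothetical PAE root array: entry 1 holds a present PDPTE-style value. */
	uint64_t pae_root[4] = { INVALID_PAE_ROOT, 0x1000 | 0x1,
				 INVALID_PAE_ROOT, INVALID_PAE_ROOT };

	assert(!IS_VALID_PAE_ROOT(pae_root[0]));
	assert(IS_VALID_PAE_ROOT(pae_root[1]));

	/* An all-ones value (KVM's INVALID_PAGE) has bit 0 set, i.e. looks PRESENT. */
	assert(((uint64_t)-1) & 0x1);
	return 0;
}
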
 struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
@@ -40,7 +50,11 @@ struct kvm_mmu_page {
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-       int root_count;          /* Currently serving as active root */
+       /* Currently serving as active root */
+       union {
+               int root_count;
+               refcount_t tdp_mmu_root_count;
+       };
        unsigned int unsync_children;
        struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
        DECLARE_BITMAP(unsync_child_bitmap, 512);
@@ -78,9 +92,14 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
        return to_shadow_page(__pa(sptep));
 }
 
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+       return role.smm ? 1 : 0;
+}
+
 static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 {
-       return sp->role.smm ? 1 : 0;
+       return kvm_mmu_role_as_id(sp->role);
 }
 
 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
@@ -108,22 +127,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                                        u64 start_gfn, u64 pages);
 
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       BUG_ON(!sp->root_count);
-       lockdep_assert_held(&kvm->mmu_lock);
-
-       ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       lockdep_assert_held(&kvm->mmu_lock);
-       --sp->root_count;
-
-       return !sp->root_count;
-}
-
 /*
  * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
  *
@@ -146,8 +149,9 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
 #define SET_SPTE_SPURIOUS              BIT(2)
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-                             gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
                            int max_level, kvm_pfn_t *pfnp,
                            bool huge_page_disallowed, int *req_level);
index 55d7b47..70b7e44 100644 (file)
@@ -503,6 +503,7 @@ error:
 #endif
        walker->fault.address = addr;
        walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+       walker->fault.async_page_fault = false;
 
        trace_kvm_mmu_walker_error(walker->fault.error_code);
        return 0;
@@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                nr_present++;
 
-               host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+               host_writable = sp->spt[i] & shadow_host_writable_mask;
 
                set_spte_ret |= set_spte(vcpu, &sp->spt[i],
                                         pte_access, PG_LEVEL_4K,
index ef55f0b..66d43ce 100644 (file)
 #include "spte.h"
 
 #include <asm/e820/api.h>
+#include <asm/vmx.h>
 
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
 u64 __read_mostly shadow_nx_mask;
 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
 u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
@@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
        u64 mask;
 
        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
-       BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
        mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
 {
        u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
-       u64 mask = generation_mmio_spte_mask(gen);
+       u64 spte = generation_mmio_spte_mask(gen);
        u64 gpa = gfn << PAGE_SHIFT;
 
+       WARN_ON_ONCE(!shadow_mmio_value);
+
        access &= shadow_mmio_access_mask;
-       mask |= shadow_mmio_value | access;
-       mask |= gpa | shadow_nonpresent_or_rsvd_mask;
-       mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+       spte |= shadow_mmio_value | access;
+       spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+       spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
                << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
 
-       return mask;
+       return spte;
 }
 
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                     bool can_unsync, bool host_writable, bool ad_disabled,
                     u64 *new_spte)
 {
-       u64 spte = 0;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
        int ret = 0;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else if (kvm_vcpu_ad_need_write_protect(vcpu))
-               spte |= SPTE_AD_WRPROT_ONLY_MASK;
+               spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+       /*
+        * Bits 62:52 of PAE SPTEs are reserved.  WARN if said bits are set
+        * when PAE paging may be employed (shadow paging or any 32-bit KVM).
+        */
+       WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+                    (spte & SPTE_TDP_AD_MASK));
 
        /*
         * For the EPT case, shadow_present_mask is 0 if hardware
@@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                        kvm_is_mmio_pfn(pfn));
 
        if (host_writable)
-               spte |= SPTE_HOST_WRITEABLE;
+               spte |= shadow_host_writable_mask;
        else
                pte_access &= ~ACC_WRITE_MASK;
 
@@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
        spte |= (u64)pfn << PAGE_SHIFT;
 
        if (pte_access & ACC_WRITE_MASK) {
-               spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+               spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
 
                /*
                 * Optimization: for pte sync, if spte was writable the hash
@@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                                 __func__, gfn);
                        ret |= SET_SPTE_WRITE_PROTECTED_PT;
                        pte_access &= ~ACC_WRITE_MASK;
-                       spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
                }
        }
 
@@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                spte = mark_spte_for_access_track(spte);
 
 out:
+       WARN_ON(is_mmio_spte(spte));
        *new_spte = spte;
        return ret;
 }
 
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 {
-       u64 spte;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
 
-       spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask | shadow_me_mask;
+       spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+               shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else
                spte |= shadow_accessed_mask;
 
@@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
        new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
        new_spte &= ~PT_WRITABLE_MASK;
-       new_spte &= ~SPTE_HOST_WRITEABLE;
+       new_spte &= ~shadow_host_writable_mask;
 
        new_spte = mark_spte_for_access_track(new_spte);
 
@@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte)
        return spte;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
 {
        BUG_ON((u64)(unsigned)access_mask != access_mask);
-       WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
        WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
-       shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+       if (!enable_mmio_caching)
+               mmio_value = 0;
+
+       /*
+        * Disable MMIO caching if the MMIO value collides with the bits that
+        * are used to hold the relocated GFN when the L1TF mitigation is
+        * enabled.  This should never fire as there is no known hardware that
+        * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+        * MMIO value are not susceptible to L1TF.
+        */
+       if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+                                 SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+               mmio_value = 0;
+
+       /*
+        * The masked MMIO value must obviously match itself and a removed SPTE
+        * must not get a false positive.  Removed SPTEs and MMIO SPTEs should
+        * never collide as MMIO must set some RWX bits, and removed SPTEs must
+        * not set any RWX bits.
+        */
+       if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+           WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+               mmio_value = 0;
+
+       shadow_mmio_value = mmio_value;
+       shadow_mmio_mask  = mmio_mask;
        shadow_mmio_access_mask = access_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
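
The two collision checks above can be exercised in isolation. A minimal standalone sketch, assuming the EPT misconfig value (WX = 0x6) and RWX mask (0x7) that kvm_mmu_set_ept_masks() passes in below, plus the REMOVED_SPTE value from spte.h; the macro names here are local to the sketch:

#include <assert.h>
#include <stdint.h>

#define REMOVED_SPTE		0x5a0ULL	/* from spte.h in this series */
#define VMX_EPT_RWX_MASK	0x7ULL
#define VMX_EPT_MISCONFIG_WX	0x6ULL		/* write+execute, never legal in EPT */

int main(void)
{
	/* The MMIO value must be covered by the MMIO mask... */
	assert((VMX_EPT_MISCONFIG_WX & VMX_EPT_RWX_MASK) == VMX_EPT_MISCONFIG_WX);

	/* ...and a removed SPTE must never be mistaken for an MMIO SPTE. */
	assert((REMOVED_SPTE & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX);
	return 0;
}
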
 
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- *  - Setting either @accessed_mask or @dirty_mask requires setting both
- *  - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 {
-       BUG_ON(!dirty_mask != !accessed_mask);
-       BUG_ON(!accessed_mask && !acc_track_mask);
-       BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
-       shadow_user_mask = user_mask;
-       shadow_accessed_mask = accessed_mask;
-       shadow_dirty_mask = dirty_mask;
-       shadow_nx_mask = nx_mask;
-       shadow_x_mask = x_mask;
-       shadow_present_mask = p_mask;
-       shadow_acc_track_mask = acc_track_mask;
-       shadow_me_mask = me_mask;
+       shadow_user_mask        = VMX_EPT_READABLE_MASK;
+       shadow_accessed_mask    = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+       shadow_dirty_mask       = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+       shadow_nx_mask          = 0ull;
+       shadow_x_mask           = VMX_EPT_EXECUTABLE_MASK;
+       shadow_present_mask     = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+       shadow_acc_track_mask   = VMX_EPT_RWX_MASK;
+       shadow_me_mask          = 0ull;
+
+       shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+       shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
+
+       /*
+        * EPT Misconfigurations are generated if the value of bits 2:0
+        * of an EPT paging-structure entry is 110b (write/execute).
+        */
+       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+                                  VMX_EPT_RWX_MASK, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
 
 void kvm_mmu_reset_all_pte_masks(void)
 {
        u8 low_phys_bits;
-
-       shadow_user_mask = 0;
-       shadow_accessed_mask = 0;
-       shadow_dirty_mask = 0;
-       shadow_nx_mask = 0;
-       shadow_x_mask = 0;
-       shadow_present_mask = 0;
-       shadow_acc_track_mask = 0;
+       u64 mask;
 
        shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void)
 
        shadow_nonpresent_or_rsvd_lower_gfn_mask =
                GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+       shadow_user_mask        = PT_USER_MASK;
+       shadow_accessed_mask    = PT_ACCESSED_MASK;
+       shadow_dirty_mask       = PT_DIRTY_MASK;
+       shadow_nx_mask          = PT64_NX_MASK;
+       shadow_x_mask           = 0;
+       shadow_present_mask     = PT_PRESENT_MASK;
+       shadow_acc_track_mask   = 0;
+       shadow_me_mask          = sme_me_mask;
+
+       shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+       shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
+
+       /*
+        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
+        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+        * 52-bit physical addresses then there are no reserved PA bits in the
+        * PTEs and so the reserved PA approach must be disabled.
+        */
+       if (shadow_phys_bits < 52)
+               mask = BIT_ULL(51) | PT_PRESENT_MASK;
+       else
+               mask = 0;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
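
The tail of kvm_mmu_reset_all_pte_masks() above re-homes the logic removed from kvm_set_mmio_spte_mask(). A small standalone illustration of the same decision; the helper name is made up for the sketch:

#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK	(1ULL << 0)

/*
 * Only CPUs with fewer than 52 physical address bits have a reserved PA bit
 * (bit 51) to spare for generating PFEC.RSVD=1 faults on MMIO SPTEs.
 */
static uint64_t mmio_mask_for(uint8_t shadow_phys_bits)
{
	return shadow_phys_bits < 52 ? ((1ULL << 51) | PT_PRESENT_MASK) : 0;
}

int main(void)
{
	printf("46-bit PA: mmio mask = %#llx\n",
	       (unsigned long long)mmio_mask_for(46));
	printf("52-bit PA: mmio mask = %#llx (reserved-PA MMIO caching disabled)\n",
	       (unsigned long long)mmio_mask_for(52));
	return 0;
}
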
index 6de3950..bca0ba1 100644 (file)
@@ -5,18 +5,33 @@
 
 #include "mmu_internal.h"
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * An MMU-present SPTE is backed by actual memory and may or may not be present
+ * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+.  MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK          BIT_ULL(11)
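
A standalone sketch of the new presence check, mirroring is_shadow_present_pte() from this patch; the constants are copied from hunks in this file and the values in main() are illustrative:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define SPTE_MMU_PRESENT_MASK	(1ULL << 11)
#define REMOVED_SPTE		0x5a0ULL

static bool is_shadow_present_pte(uint64_t pte)
{
	/* Purely a software bit; independent of hardware-present (MMIO SPTEs aren't). */
	return pte & SPTE_MMU_PRESENT_MASK;
}

int main(void)
{
	assert(is_shadow_present_pte(0x123000ULL | SPTE_MMU_PRESENT_MASK));
	assert(!is_shadow_present_pte(0));		/* empty SPTE */
	assert(!is_shadow_present_pte(REMOVED_SPTE));	/* frozen/removed SPTE */
	return 0;
}
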
 
 /*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE.  This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
  */
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT              52
+#define SPTE_TDP_AD_MASK               (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK       (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK      (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK   (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 
 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.  This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+                                         PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK    (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+                                        SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
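
A standalone sketch of the save/restore that the mask above enables. Only the R/X handling of mark_spte_for_access_track() is mirrored here; the W-bit and shadow_acc_track_mask handling are omitted, and the SPTE value is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* EPT R/X are bits 0 and 2; saved copies live 54 bits higher. */
#define EPT_READABLE		(1ULL << 0)
#define EPT_EXECUTABLE		(1ULL << 2)
#define SAVED_BITS_MASK		(EPT_READABLE | EPT_EXECUTABLE)
#define SAVED_BITS_SHIFT	54

static uint64_t mark_for_access_track(uint64_t spte)
{
	/* Stash R/X high, clear them low so the next access faults. */
	spte |= (spte & SAVED_BITS_MASK) << SAVED_BITS_SHIFT;
	spte &= ~SAVED_BITS_MASK;
	return spte;
}

static uint64_t restore_access(uint64_t spte)
{
	uint64_t saved = (spte >> SAVED_BITS_SHIFT) & SAVED_BITS_MASK;

	spte &= ~(SAVED_BITS_MASK << SAVED_BITS_SHIFT);
	return spte | saved;
}

int main(void)
{
	uint64_t spte = 0x123000ULL | EPT_READABLE | EPT_EXECUTABLE;
	uint64_t tracked = mark_for_access_track(spte);

	assert(!(tracked & SAVED_BITS_MASK));	/* not-present to hardware */
	assert(restore_access(tracked) == spte);
	printf("tracked = %#llx\n", (unsigned long long)tracked);
	return 0;
}
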
+
+/*
+ * Low ignored bits are at a premium for EPT; use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE         BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE          BIT_ULL(58)
 
-#define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
 
 /*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
  * the memslots generation and is derived as follows:
  *
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
  *
  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
  * the MMIO generation number, as doing so would require stealing a bit from
  */
 
 #define MMIO_SPTE_GEN_LOW_START                3
-#define MMIO_SPTE_GEN_LOW_END          11
+#define MMIO_SPTE_GEN_LOW_END          10
 
-#define MMIO_SPTE_GEN_HIGH_START       PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START       52
 #define MMIO_SPTE_GEN_HIGH_END         62
 
 #define MMIO_SPTE_GEN_LOW_MASK         GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
                                                    MMIO_SPTE_GEN_LOW_START)
 #define MMIO_SPTE_GEN_HIGH_MASK                GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
                                                    MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+               (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
 
 #define MMIO_SPTE_GEN_LOW_BITS         (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
 #define MMIO_SPTE_GEN_HIGH_BITS                (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
 
 /* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
 
 #define MMIO_SPTE_GEN_LOW_SHIFT                (MMIO_SPTE_GEN_LOW_START - 0)
 #define MMIO_SPTE_GEN_HIGH_SHIFT       (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
 
 #define MMIO_SPTE_GEN_MASK             GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
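
A standalone round-trip check of the 19-bit generation layout described above; GENMASK_ULL is re-implemented locally so the sketch builds in userspace:

#include <assert.h>
#include <stdint.h>

#define GENMASK_ULL(h, l)	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define GEN_LOW_START	3
#define GEN_LOW_END	10
#define GEN_HIGH_START	52
#define GEN_HIGH_END	62

#define GEN_LOW_MASK	GENMASK_ULL(GEN_LOW_END, GEN_LOW_START)
#define GEN_HIGH_MASK	GENMASK_ULL(GEN_HIGH_END, GEN_HIGH_START)
#define GEN_LOW_BITS	(GEN_LOW_END - GEN_LOW_START + 1)
#define GEN_HIGH_BITS	(GEN_HIGH_END - GEN_HIGH_START + 1)
#define GEN_LOW_SHIFT	(GEN_LOW_START - 0)
#define GEN_HIGH_SHIFT	(GEN_HIGH_START - GEN_LOW_BITS)
#define GEN_MASK	GENMASK_ULL(GEN_LOW_BITS + GEN_HIGH_BITS - 1, 0)

static uint64_t pack(uint64_t gen)
{
	uint64_t spte = 0;

	spte |= (gen << GEN_LOW_SHIFT) & GEN_LOW_MASK;	/* gen bits 0-7 -> spte 3-10 */
	spte |= (gen << GEN_HIGH_SHIFT) & GEN_HIGH_MASK;/* gen bits 8-18 -> spte 52-62 */
	return spte;
}

static uint64_t unpack(uint64_t spte)
{
	return ((spte & GEN_LOW_MASK) >> GEN_LOW_SHIFT) |
	       ((spte & GEN_HIGH_MASK) >> GEN_HIGH_SHIFT);
}

int main(void)
{
	uint64_t gen;

	/* Round-trip every possible 19-bit generation value. */
	for (gen = 0; gen <= GEN_MASK; gen++)
		assert(unpack(pack(gen)) == gen);

	/* Bit 11 (SPTE_MMU_PRESENT_MASK) is never touched by the generation. */
	assert(!(pack(GEN_MASK) & (1ULL << 11)));
	return 0;
}
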
 
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
 extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
 extern u64 __read_mostly shadow_me_mask;
 
 /*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
  * pages.
  */
@@ -120,29 +170,22 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
  */
 #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
 
-/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
-                                         PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
 /*
  * If a thread running without exclusive control of the MMU lock must perform a
  * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
  * non-present intermediate value. Other threads which encounter this value
  * should not modify the SPTE.
  *
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create an L1TF
+ * vulnerability.  Use only low bits to avoid 64-bit immediates.
  *
  * Only used by the TDP MMU.
  */
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE   0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
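
A quick standalone check of the properties the comment relies on: 0x5a0 sets no RWX bits, no PFN bits, and not the new MMU-present bit. The bit ranges are written out locally for the sketch:

#include <assert.h>
#include <stdint.h>

#define REMOVED_SPTE		0x5a0ULL
#define SPTE_MMU_PRESENT_MASK	(1ULL << 11)
#define RWX_BITS		0x7ULL
#define PFN_BITS		(((1ULL << 52) - 1) & ~((1ULL << 12) - 1))	/* bits 51:12 */

int main(void)
{
	assert(!(REMOVED_SPTE & RWX_BITS));		/* not-present on both AMD and Intel */
	assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));/* not MMU-present either */
	assert(!(REMOVED_SPTE & PFN_BITS));		/* no PFN bits -> no L1TF exposure */
	return 0;
}
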
 
 static inline bool is_removed_spte(u64 spte)
 {
@@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits;
 
 static inline bool is_mmio_spte(u64 spte)
 {
-       return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+       return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+              likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+       return !!(pte & SPTE_MMU_PRESENT_MASK);
 }
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 
 static inline bool spte_ad_enabled(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
 }
 
 static inline bool spte_ad_need_write_protect(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       /*
+        * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+        * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
+        * TDP and do the A/D type check unconditionally.
+        */
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
 }
 
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 }
 
 static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 }
 
@@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte)
        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
-static inline bool is_shadow_present_pte(u64 pte)
-{
-       return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
 static inline bool is_large_pte(u64 pte)
 {
        return pte & PT_PAGE_SIZE_MASK;
@@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte)
 
 static inline bool spte_can_locklessly_be_made_writable(u64 spte)
 {
-       return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
-               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+       return (spte & shadow_host_writable_mask) &&
+              (spte & shadow_mmu_writable_mask);
 }
 
 static inline u64 get_mmio_spte_generation(u64 spte)
index 018d82e..83cbdbe 100644 (file)
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+                                                            bool shared)
+{
+       if (shared)
+               lockdep_assert_held_read(&kvm->mmu_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
        if (!kvm->arch.tdp_mmu_enabled)
@@ -41,32 +50,85 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
        rcu_barrier();
 }
 
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-       if (kvm_mmu_put_root(kvm, root))
-               kvm_tdp_mmu_free_root(kvm, root);
+       free_page((unsigned long)sp->spt);
+       kmem_cache_free(mmu_page_header_cache, sp);
 }
 
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
-                                          struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read-side critical
+ * section, and freeing it after a grace period, lockless walkers are
+ * guaranteed to never use the memory after it has been freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 {
-       lockdep_assert_held_write(&kvm->mmu_lock);
+       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+                                              rcu_head);
 
-       if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
-               return false;
+       tdp_mmu_free_sp(sp);
+}
 
-       kvm_mmu_get_root(kvm, root);
-       return true;
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared)
+{
+       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
+       if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+               return;
+
+       WARN_ON(!root->tdp_mmu_page);
+
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       list_del_rcu(&root->link);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+       zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+
+       call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
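
A minimal userspace sketch of the get/put lifecycle now used for TDP MMU roots, with C11 atomics standing in for refcount_t/kvm_tdp_mmu_get_root() and the unlink/zap/RCU-free step reduced to a comment; the struct and helper names are invented for the sketch:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct root {
	atomic_int refcount;
};

static bool root_get(struct root *r)
{
	/* Like kvm_tdp_mmu_get_root(): fail if the count already hit zero. */
	int old = atomic_load(&r->refcount);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak(&r->refcount, &old, old + 1));
	return true;
}

static void root_put(struct root *r)
{
	if (atomic_fetch_sub(&r->refcount, 1) == 1) {
		/*
		 * Last reference: in KVM this is where the root is unlinked,
		 * zapped, and freed after an RCU grace period.
		 */
		free(r);
	}
}

int main(void)
{
	struct root *r = malloc(sizeof(*r));

	atomic_init(&r->refcount, 1);
	assert(root_get(r));	/* take a second reference */
	root_put(r);		/* drop it */
	root_put(r);		/* drop the original; frees r */
	return 0;
}
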
 
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-                                                    struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+                                             struct kvm_mmu_page *prev_root,
+                                             bool shared)
 {
        struct kvm_mmu_page *next_root;
 
-       next_root = list_next_entry(root, link);
-       tdp_mmu_put_root(kvm, root);
+       rcu_read_lock();
+
+       if (prev_root)
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &prev_root->link,
+                                                 typeof(*prev_root), link);
+       else
+               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                  typeof(*next_root), link);
+
+       while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                               &next_root->link, typeof(*next_root), link);
+
+       rcu_read_unlock();
+
+       if (prev_root)
+               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
        return next_root;
 }
 
@@ -75,35 +137,24 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * This makes it safe to release the MMU lock and yield within the loop, but
  * if exiting the loop early, the caller must drop the reference to the most
  * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
  */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                          \
-       for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
-                                     typeof(*_root), link);            \
-            tdp_mmu_next_root_valid(_kvm, _root);                      \
-            _root = tdp_mmu_next_root(_kvm, _root))
-
-#define for_each_tdp_mmu_root(_kvm, _root)                             \
-       list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield, bool flush);
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
-       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
-       lockdep_assert_held_write(&kvm->mmu_lock);
-
-       WARN_ON(root->root_count);
-       WARN_ON(!root->tdp_mmu_page);
-
-       list_del(&root->link);
-
-       zap_gfn_range(kvm, root, 0, max_gfn, false, false);
-
-       free_page((unsigned long)root->spt);
-       kmem_cache_free(mmu_page_header_cache, root);
-}
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
+            _root;                                                     \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+               } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                             \
+       list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
+                               lockdep_is_held_type(&kvm->mmu_lock, 0) ||      \
+                               lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+               } else
 
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
@@ -137,81 +188,46 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
        return sp;
 }
 
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;
 
-       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
-       write_lock(&kvm->mmu_lock);
+       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
        /* Check for an existing root before allocating a new one. */
-       for_each_tdp_mmu_root(kvm, root) {
-               if (root->role.word == role.word) {
-                       kvm_mmu_get_root(kvm, root);
-                       write_unlock(&kvm->mmu_lock);
-                       return root;
-               }
+       for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+               if (root->role.word == role.word &&
+                   kvm_tdp_mmu_get_root(kvm, root))
+                       goto out;
        }
 
        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
-       root->root_count = 1;
-
-       list_add(&root->link, &kvm->arch.tdp_mmu_roots);
-
-       write_unlock(&kvm->mmu_lock);
-
-       return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *root;
+       refcount_set(&root->tdp_mmu_root_count, 1);
 
-       root = get_tdp_mmu_vcpu_root(vcpu);
-       if (!root)
-               return INVALID_PAGE;
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
+out:
        return __pa(root->spt);
 }
 
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-       free_page((unsigned long)sp->spt);
-       kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
-       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
-                                              rcu_head);
-
-       tdp_mmu_free_sp(sp);
-}
-
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);
 
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 {
-       bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;
 
        if (is_accessed_spte(old_spte) &&
-           (!is_accessed_spte(new_spte) || pfn_changed))
+           (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+            spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
 
@@ -455,7 +471,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
 
        if (was_leaf && is_dirty_spte(old_spte) &&
-           (!is_dirty_spte(new_spte) || pfn_changed))
+           (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
        /*
@@ -479,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 }
 
 /*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -488,9 +505,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
  * Returns: true if the SPTE was set, false if it was not. If false is returned,
  *         this function will have no side-effects.
  */
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-                                          struct tdp_iter *iter,
-                                          u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+                                                       struct tdp_iter *iter,
+                                                       u64 new_spte)
 {
        lockdep_assert_held_read(&kvm->mmu_lock);
 
@@ -498,19 +515,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
-       if (iter->old_spte == REMOVED_SPTE)
+       if (is_removed_spte(iter->old_spte))
                return false;
 
        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
                      new_spte) != iter->old_spte)
                return false;
 
-       handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-                           new_spte, iter->level, true);
+       __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+                             new_spte, iter->level, true);
+       handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
 
        return true;
 }
 
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+                                          struct tdp_iter *iter,
+                                          u64 new_spte)
+{
+       if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+               return false;
+
+       handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+                                     iter->old_spte, new_spte, iter->level);
+       return true;
+}
+
 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                           struct tdp_iter *iter)
 {
@@ -569,7 +599,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
         * should be used. If operating under the MMU lock in write mode, the
         * use of the removed SPTE should not be necessary.
         */
-       WARN_ON(iter->old_spte == REMOVED_SPTE);
+       WARN_ON(is_removed_spte(iter->old_spte));
 
        WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
@@ -634,7 +664,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
  * Return false if a yield was not needed.
  */
 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-                                            struct tdp_iter *iter, bool flush)
+                                            struct tdp_iter *iter, bool flush,
+                                            bool shared)
 {
        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -646,7 +677,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
 
-               cond_resched_rwlock_write(&kvm->mmu_lock);
+               if (shared)
+                       cond_resched_rwlock_read(&kvm->mmu_lock);
+               else
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
+
                rcu_read_lock();
 
                WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -664,24 +699,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
  * function cannot yield, it will not release the MMU lock or reschedule and
  * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.  Note, in some use cases a flush may be
- * required by prior actions.  Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield, bool flush)
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared)
 {
        struct tdp_iter iter;
 
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
+retry:
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
                        flush = false;
                        continue;
                }
@@ -699,8 +742,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-               flush = true;
+               if (!shared) {
+                       tdp_mmu_set_spte(kvm, &iter, 0);
+                       flush = true;
+               } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
        }
 
        rcu_read_unlock();
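
The retry path above follows the standard compare-and-exchange pattern: re-read the SPTE that the failed cmpxchg observed and try again. A minimal standalone sketch using the GCC/Clang __atomic builtins; the names and values are illustrative:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Replace *sptep with new_spte only if it still holds *old_spte. */
static bool set_spte_atomic(uint64_t *sptep, uint64_t *old_spte, uint64_t new_spte)
{
	return __atomic_compare_exchange_n(sptep, old_spte, new_spte, false,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	uint64_t spte = 0x123007ULL;
	uint64_t old = 0xdeadULL;	/* stale snapshot, first cmpxchg fails */

	while (!set_spte_atomic(&spte, &old, 0)) {
		/*
		 * On failure the builtin re-reads the current value into 'old',
		 * mirroring "iter.old_spte = READ_ONCE(*sptep); goto retry;".
		 */
	}
	assert(spte == 0);
	return 0;
}
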
@@ -712,15 +764,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
-                                bool can_yield)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared)
 {
        struct kvm_mmu_page *root;
-       bool flush = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root)
-               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+                                     shared);
 
        return flush;
 }
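The zap loop in zap_gfn_range() retries a failed tdp_mmu_zap_spte_atomic() by re-reading the SPTE and jumping back to the retry label; the same retry shape recurs below in wrprot_gfn_range(), clear_dirty_gfn_range() and zap_collapsible_spte_range(). A minimal user-space sketch of that pattern using C11 atomics (not kernel code; 'spte' and 'zap_entry' are invented for the example):

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a single SPTE slot that other threads may also be zapping. */
static _Atomic uint64_t spte;

/*
 * Clear the entry. compare_exchange_weak() reloads 'old' with the current
 * contents on failure, which plays the role of the explicit re-read before
 * the 'goto retry' in zap_gfn_range().
 */
static uint64_t zap_entry(void)
{
	uint64_t old = atomic_load(&spte);

	while (!atomic_compare_exchange_weak(&spte, &old, 0))
		;	/* 'old' was refreshed by the failed exchange, try again */

	return old;	/* caller decides whether a TLB-flush analogue is needed */
}

int main(void)
{
	atomic_store(&spte, UINT64_C(0x1234));
	printf("zapped old value %#" PRIx64 "\n", zap_entry());
	return 0;
}

C11's compare-exchange refreshes the expected value on failure, which is why the explicit re-read in the kernel code folds into a plain loop here.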
@@ -728,13 +786,115 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 {
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-       bool flush;
+       bool flush = false;
+       int i;
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+                                                 flush, false);
+
+       if (flush)
+               kvm_flush_remote_tlbs(kvm);
+}
+
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+                                                 struct kvm_mmu_page *prev_root)
+{
+       struct kvm_mmu_page *next_root;
+
+       if (prev_root)
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &prev_root->link,
+                                                 typeof(*prev_root), link);
+       else
+               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                  typeof(*next_root), link);
+
+       while (next_root && !(next_root->role.invalid &&
+                             refcount_read(&next_root->tdp_mmu_root_count)))
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &next_root->link,
+                                                 typeof(*next_root), link);
+
+       return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_invalidate_all_roots has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+       struct kvm_mmu_page *next_root;
+       struct kvm_mmu_page *root;
+       bool flush = false;
+
+       lockdep_assert_held_read(&kvm->mmu_lock);
+
+       rcu_read_lock();
+
+       root = next_invalidated_root(kvm, NULL);
+
+       while (root) {
+               next_root = next_invalidated_root(kvm, root);
+
+               rcu_read_unlock();
+
+               flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+                                     true);
+
+               /*
+                * Put the reference acquired in
+                * kvm_tdp_mmu_invalidate_all_roots.
+                */
+               kvm_tdp_mmu_put_root(kvm, root, true);
+
+               root = next_root;
+
+               rcu_read_lock();
+       }
+
+       rcu_read_unlock();
 
-       flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
 }
 
+/*
+ * Mark each TDP MMU root as invalid so that other threads
+ * will drop their references and allow the root count to
+ * go to 0.
+ *
+ * Also take a reference on all roots so that this thread
+ * can do the bulk of the work required to free the roots
+ * once they are invalidated. Without this reference, a
+ * vCPU thread might drop the last reference to a root and
+ * get stuck with tearing down the entire paging structure.
+ *
+ * Roots which have a zero refcount should be skipped as
+ * they're already being torn down.
+ * Already invalid roots should be referenced again so that
+ * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots is
+ * done with them.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+       struct kvm_mmu_page *root;
+
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+               if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+                       root->role.invalid = true;
+}
+
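kvm_tdp_mmu_invalidate_all_roots() only pins roots whose refcount has not already dropped to zero. A rough user-space sketch of that inc-unless-zero acquire (invented types, not the kernel's refcount_t API):

#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical root descriptor; only the fields the sketch needs. */
struct root {
	_Atomic int refcount;
	bool invalid;
};

/* Take a reference unless the root is already on its way to being freed. */
static bool root_try_get(struct root *r)
{
	int old = atomic_load(&r->refcount);

	do {
		if (old == 0)
			return false;	/* already being torn down, skip it */
	} while (!atomic_compare_exchange_weak(&r->refcount, &old, old + 1));

	return true;
}

/* Mark-invalid pass, same shape as kvm_tdp_mmu_invalidate_all_roots(). */
static void invalidate_root(struct root *r)
{
	if (root_try_get(r))
		r->invalid = true;
}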
 /*
  * Installs a last-level SPTE to handle a TDP page fault.
  * (NPT/EPT violation/misconfiguration)
@@ -777,12 +937,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
-       } else
+       } else {
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));
+       }
 
-       trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-                              rcu_dereference(iter->sptep));
        if (!prefault)
                vcpu->stat.pf_fixed++;
 
@@ -882,199 +1041,139 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        return ret;
 }
 
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            unsigned long data,
-                            int (*handler)(struct kvm *kvm,
-                                           struct kvm_memory_slot *slot,
-                                           struct kvm_mmu_page *root,
-                                           gfn_t start,
-                                           gfn_t end,
-                                           unsigned long data))
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
-       int ret = 0;
-       int as_id;
-
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               as_id = kvm_mmu_page_as_id(root);
-               slots = __kvm_memslots(kvm, as_id);
-               kvm_for_each_memslot(memslot, slots) {
-                       unsigned long hva_start, hva_end;
-                       gfn_t gfn_start, gfn_end;
-
-                       hva_start = max(start, memslot->userspace_addr);
-                       hva_end = min(end, memslot->userspace_addr +
-                                     (memslot->npages << PAGE_SHIFT));
-                       if (hva_start >= hva_end)
-                               continue;
-                       /*
-                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                        */
-                       gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-                       gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-                       ret |= handler(kvm, memslot, root, gfn_start,
-                                      gfn_end, data);
-               }
-       }
+       for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+               flush |= zap_gfn_range(kvm, root, range->start, range->end,
+                                      range->may_block, flush, false);
 
-       return ret;
+       return flush;
 }
 
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
-                                    struct kvm_memory_slot *slot,
-                                    struct kvm_mmu_page *root, gfn_t start,
-                                    gfn_t end, unsigned long unused)
-{
-       return zap_gfn_range(kvm, root, start, end, false, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+                             struct kvm_gfn_range *range);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+                                                  struct kvm_gfn_range *range,
+                                                  tdp_handler_t handler)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-                                           zap_gfn_range_hva_wrapper);
+       struct kvm_mmu_page *root;
+       struct tdp_iter iter;
+       bool ret = false;
+
+       rcu_read_lock();
+
+       /*
+        * Rescheduling isn't supported: none of the MMU notifiers that funnel
+        * into this helper allow blocking, so it'd be dead, wasteful code.
+        */
+       for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+               tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+                       ret |= handler(kvm, &iter, range);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
 }
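kvm_tdp_mmu_handle_gfn() funnels several MMU-notifier events through one walk plus a per-SPTE callback. A compact stand-alone illustration of that shape (struct layouts and names are invented; the example handler mimics test_age_gfn()):

#include <stdbool.h>
#include <stddef.h>

/* Invented stand-ins for tdp_iter and kvm_gfn_range. */
struct entry { unsigned long gfn; unsigned long long spte; };
struct range { unsigned long start, end; };

typedef bool (*gfn_handler_t)(struct entry *e, const struct range *r);

/* One walk, many notifier flavours: OR together the per-entry results. */
static bool handle_gfn_range(struct entry *table, size_t nr,
			     const struct range *r, gfn_handler_t handler)
{
	bool ret = false;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (table[i].gfn >= r->start && table[i].gfn < r->end)
			ret |= handler(&table[i], r);
	}
	return ret;
}

/* Example handler in the spirit of test_age_gfn(): report, don't modify. */
static bool entry_is_accessed(struct entry *e, const struct range *r)
{
	(void)r;
	return e->spte & (1ULL << 5);	/* pretend bit 5 is the accessed bit */
}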
 
 /*
  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
  * if any of the GFNs in the range have been accessed.
  */
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
-                        struct kvm_mmu_page *root, gfn_t start, gfn_t end,
-                        unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+                         struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-       int young = 0;
        u64 new_spte = 0;
 
-       rcu_read_lock();
+       /* If we have a non-accessed entry we don't need to change the pte. */
+       if (!is_accessed_spte(iter->old_spte))
+               return false;
 
-       tdp_root_for_each_leaf_pte(iter, root, start, end) {
+       new_spte = iter->old_spte;
+
+       if (spte_ad_enabled(new_spte)) {
+               new_spte &= ~shadow_accessed_mask;
+       } else {
                /*
-                * If we have a non-accessed entry we don't need to change the
-                * pte.
+                * Capture the dirty status of the page, so that it doesn't get
+                * lost when the SPTE is marked for access tracking.
                 */
-               if (!is_accessed_spte(iter.old_spte))
-                       continue;
-
-               new_spte = iter.old_spte;
-
-               if (spte_ad_enabled(new_spte)) {
-                       clear_bit((ffs(shadow_accessed_mask) - 1),
-                                 (unsigned long *)&new_spte);
-               } else {
-                       /*
-                        * Capture the dirty status of the page, so that it doesn't get
-                        * lost when the SPTE is marked for access tracking.
-                        */
-                       if (is_writable_pte(new_spte))
-                               kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+               if (is_writable_pte(new_spte))
+                       kvm_set_pfn_dirty(spte_to_pfn(new_spte));
 
-                       new_spte = mark_spte_for_access_track(new_spte);
-               }
-               new_spte &= ~shadow_dirty_mask;
-
-               tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
-               young = 1;
-
-               trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+               new_spte = mark_spte_for_access_track(new_spte);
        }
 
-       rcu_read_unlock();
+       tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
 
-       return young;
+       return true;
 }
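age_gfn_range() either clears the accessed bit directly (A/D-enabled SPTEs) or hides the writable bit and marks the entry for access tracking, remembering along the way that the page was dirty. A simplified sketch of that decision, with a made-up SPTE bit layout (illustration only):

#include <stdbool.h>
#include <stdint.h>

/* Invented SPTE layout purely for illustration. */
#define SPTE_ACCESSED	(1ULL << 5)
#define SPTE_WRITABLE	(1ULL << 1)
#define SPTE_ACC_TRACK	(1ULL << 62)

/* Returns the aged entry; *was_dirty tells the caller to record dirty state
 * when the writable bit has to be hidden for access tracking. */
static uint64_t age_entry(uint64_t spte, bool ad_enabled, bool *was_dirty)
{
	*was_dirty = false;

	/* A non-accessed entry needs no change, as in age_gfn_range(). */
	if (!(spte & SPTE_ACCESSED))
		return spte;

	if (ad_enabled)
		return spte & ~SPTE_ACCESSED;

	/* No A/D bits: capture dirty state, then mark for access tracking. */
	if (spte & SPTE_WRITABLE)
		*was_dirty = true;

	return (spte & ~SPTE_WRITABLE) | SPTE_ACC_TRACK;
}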
 
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-                                           age_gfn_range);
+       return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 }
 
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+                        struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-
-       tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
-               if (is_accessed_spte(iter.old_spte))
-                       return 1;
-
-       return 0;
+       return is_accessed_spte(iter->old_spte);
 }
 
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
-                                           test_age_gfn);
+       return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+                        struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-       pte_t *ptep = (pte_t *)data;
-       kvm_pfn_t new_pfn;
        u64 new_spte;
-       int need_flush = 0;
-
-       rcu_read_lock();
 
-       WARN_ON(pte_huge(*ptep));
+       /* Huge pages aren't expected to be modified without first being zapped. */
+       WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
 
-       new_pfn = pte_pfn(*ptep);
-
-       tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
-               if (iter.level != PG_LEVEL_4K)
-                       continue;
-
-               if (!is_shadow_present_pte(iter.old_spte))
-                       break;
-
-               tdp_mmu_set_spte(kvm, &iter, 0);
-
-               kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+       if (iter->level != PG_LEVEL_4K ||
+           !is_shadow_present_pte(iter->old_spte))
+               return false;
 
-               if (!pte_write(*ptep)) {
-                       new_spte = kvm_mmu_changed_pte_notifier_make_spte(
-                                       iter.old_spte, new_pfn);
+       /*
+        * Note, when changing a read-only SPTE, it's not strictly necessary to
+        * zero the SPTE before setting the new PFN, but doing so preserves the
+        * invariant that the PFN of a present leaf SPTE can never change.
+        * See __handle_changed_spte().
+        */
+       tdp_mmu_set_spte(kvm, iter, 0);
 
-                       tdp_mmu_set_spte(kvm, &iter, new_spte);
-               }
+       if (!pte_write(range->pte)) {
+               new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+                                                                 pte_pfn(range->pte));
 
-               need_flush = 1;
+               tdp_mmu_set_spte(kvm, iter, new_spte);
        }
 
-       if (need_flush)
-               kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
-       rcu_read_unlock();
-
-       return 0;
+       return true;
 }
 
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-                            pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * range->pte holds the new host PTE for the single GFN covered by the
+ * notifier.
+ * Returns true if a TLB flush is needed before releasing the MMU lock (for
+ * now the flush is done here instead; see the FIXME below).
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
-                                           (unsigned long)host_ptep,
-                                           set_tdp_spte);
+       bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+       /* FIXME: return 'flush' instead of flushing here. */
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+       return false;
 }
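As the comment in set_spte_gfn() notes, zapping to zero before installing the new SPTE preserves the invariant that the PFN of a present leaf SPTE never changes in place. A tiny sketch of that zero-then-install ordering (invented entry format, not the real SPTE layout):

#include <stdatomic.h>
#include <stdint.h>

#define ENTRY_PRESENT	(1ULL << 0)

/* Invented helper: observers must never see the PFN of a present entry
 * change in place, so go through a non-present (zero) intermediate state. */
static void change_entry_pfn(_Atomic uint64_t *entry, uint64_t new_entry)
{
	atomic_store(entry, 0);		/* not present: safe to retarget */
	/* A real MMU would flush TLBs here before installing the new entry. */
	atomic_store(entry, new_entry | ENTRY_PRESENT);
}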
 
 /*
@@ -1095,7 +1194,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1105,7 +1205,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 
-               tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+               if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+                                                         new_spte)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
                spte_set = true;
        }
 
@@ -1122,17 +1230,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);
-       }
 
        return spte_set;
 }
@@ -1154,7 +1258,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        rcu_read_lock();
 
        tdp_root_for_each_leaf_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
                if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1169,7 +1274,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                continue;
                }
 
-               tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+               if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+                                                         new_spte)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
                spte_set = true;
        }
 
@@ -1187,17 +1300,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);
-       }
 
        return spte_set;
 }
@@ -1259,37 +1368,32 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       bool wrprot)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
 
        lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_tdp_mmu_root(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
+       for_each_tdp_mmu_root(kvm, root, slot->as_id)
                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
-       }
 }
 
 /*
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
  */
-static void zap_collapsible_spte_range(struct kvm *kvm,
+static bool zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
-                                      struct kvm_memory_slot *slot)
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush)
 {
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        kvm_pfn_t pfn;
-       bool spte_set = false;
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
-                       spte_set = false;
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+                       flush = false;
                        continue;
                }
 
@@ -1303,38 +1407,43 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
                                                            pfn, PG_LEVEL_NUM))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-
-               spte_set = true;
+               if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
+               flush = true;
        }
 
        rcu_read_unlock();
-       if (spte_set)
-               kvm_flush_remote_tlbs(kvm);
+
+       return flush;
 }
 
 /*
  * Clear non-leaf entries (and free associated page tables) which could
  * be replaced by large mappings, for GFNs within the slot.
  */
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
-               zap_collapsible_spte_range(kvm, root, slot);
-       }
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+               flush = zap_collapsible_spte_range(kvm, root, slot, flush);
+
+       return flush;
 }
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1351,7 +1460,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                        break;
 
                new_spte = iter.old_spte &
-                       ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
@@ -1364,24 +1473,19 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
        lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_tdp_mmu_root(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
+       for_each_tdp_mmu_root(kvm, root, slot->as_id)
                spte_set |= write_protect_gfn(kvm, root, gfn);
-       }
+
        return spte_set;
 }
 
index 31096ec..5fdf630 100644 (file)
@@ -6,14 +6,28 @@
 #include <linux/kvm_host.h>
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
-                                bool can_yield);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
-                                            gfn_t end)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+                                                    struct kvm_mmu_page *root)
 {
-       return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+       if (root->role.invalid)
+               return false;
+
+       return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
+
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+                                            gfn_t start, gfn_t end, bool flush,
+                                            bool shared)
+{
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+                                          shared);
 }
 static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
@@ -29,23 +43,23 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
         * of the shadow page's gfn range and stop iterating before yielding.
         */
        lockdep_assert_held_write(&kvm->mmu_lock);
-       return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
+                                          sp->gfn, end, false, false, false);
 }
+
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
 
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-                            pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 
 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level);
@@ -55,8 +69,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot);
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn);
index 78bdcfa..cd0285f 100644 (file)
@@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
                return -EINVAL;
 
-       if (!svm->vcpu.arch.apic->regs)
+       if (!vcpu->arch.apic->regs)
                return -EINVAL;
 
        if (kvm_apicv_activated(vcpu->kvm)) {
@@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
                        return ret;
        }
 
-       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+       svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
 
        /* Setting AVIC backing page address in the phy APIC ID table */
        entry = avic_get_physical_id_entry(vcpu, id);
@@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
        }
 }
 
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
        u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
-       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+       trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
 
        switch (id) {
        case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
@@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
                 * set the appropriate IRR bits on the valid target
                 * vcpus. So, we just need to kick the appropriate vcpu.
                 */
-               avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+               avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
                break;
        case AVIC_IPI_FAILURE_INVALID_TARGET:
                WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
-                         index, svm->vcpu.vcpu_id, icrh, icrl);
+                         index, vcpu->vcpu_id, icrh, icrl);
                break;
        case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
                WARN_ONCE(1, "Invalid backing page\n");
@@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
        return ret;
 }
 
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret = 0;
        u32 offset = svm->vmcb->control.exit_info_1 &
                     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                     AVIC_UNACCEL_ACCESS_WRITE_MASK;
        bool trap = is_avic_unaccelerated_access_trap(offset);
 
-       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+       trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
                                            trap, write, vector);
        if (trap) {
                /* Handling Trap */
@@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                ret = avic_unaccel_trap_write(svm);
        } else {
                /* Handling Fault */
-               ret = kvm_emulate_instruction(&svm->vcpu, 0);
+               ret = kvm_emulate_instruction(vcpu, 0);
        }
 
        return ret;
@@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
        if (!avic || !irqchip_in_kernel(vcpu->kvm))
                return 0;
 
-       ret = avic_init_backing_page(&svm->vcpu);
+       ret = avic_init_backing_page(vcpu);
        if (ret)
                return ret;
 
index fb204ea..540d43b 100644 (file)
@@ -29,6 +29,8 @@
 #include "lapic.h"
 #include "svm.h"
 
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
 
        WARN_ON(mmu_is_nested(vcpu));
 
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+                               svm->vmcb01.ptr->save.efer,
                                svm->nested.ctl.nested_cr3);
        vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
                return;
 
        c = &svm->vmcb->control;
-       h = &svm->nested.hsave->control;
+       h = &svm->vmcb01.ptr->control;
        g = &svm->nested.ctl;
 
        for (i = 0; i < MAX_INTERCEPT; i++)
@@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
        return true;
 }
 
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of the bitmap address are ignored by hardware.
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 addr = PAGE_ALIGN(pa);
 
-       if (WARN_ON(!is_guest_mode(vcpu)))
-               return true;
-
-       if (!nested_svm_vmrun_msrpm(svm)) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror =
-                       KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
-               return false;
-       }
-
-       return true;
+       return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+           kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
 }
 
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+                                      struct vmcb_control_area *control)
 {
-       if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+       if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
                return false;
 
-       if (control->asid == 0)
+       if (CC(control->asid == 0))
                return false;
 
-       if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-           !npt_enabled)
+       if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+               return false;
+
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+                                          MSRPM_SIZE)))
+               return false;
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+                                          IOPM_SIZE)))
                return false;
 
        return true;
 }
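nested_vmcb_check_controls() rejects MSR and I/O permission bitmaps whose first or last byte falls outside the guest's legal physical address range. A stand-alone sketch of the same check (maxphyaddr and the helper names are invented; the kernel uses kvm_vcpu_is_legal_gpa()):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: 'maxphyaddr' stands in for the guest's reported
 * physical-address width. */
static bool gpa_is_legal(uint64_t gpa, unsigned int maxphyaddr)
{
	return maxphyaddr >= 64 || (gpa >> maxphyaddr) == 0;
}

/* Both the first and the last byte of the bitmap must be addressable,
 * the same shape as nested_svm_check_bitmap_pa() above. */
static bool bitmap_pa_ok(uint64_t pa, uint32_t size, unsigned int maxphyaddr)
{
	uint64_t addr = (pa + 0xfff) & ~0xfffULL;	/* PAGE_ALIGN() analogue */

	return gpa_is_legal(addr, maxphyaddr) &&
	       gpa_is_legal(addr + size - 1, maxphyaddr);
}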
 
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+                                     struct vmcb_save_area *save)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       bool vmcb12_lma;
+       /*
+        * These checks are also performed by KVM_SET_SREGS,
+        * except that EFER.LMA is not checked by SVM against
+        * CR0.PG && EFER.LME.
+        */
+       if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+               if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+                   CC(!(save->cr0 & X86_CR0_PE)) ||
+                   CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+                       return false;
+       }
+
+       if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+               return false;
+
+       return true;
+}
 
+/* Common checks that apply to both L1 and L2 state.  */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+                                   struct vmcb_save_area *save)
+{
        /*
         * FIXME: these should be done after copying the fields,
         * to avoid TOC/TOU races.  For these save area checks
@@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
         * kvm_set_cr4 handle failure; EFER_SVME is an exception
         * so it is force-set later in nested_prepare_vmcb_save.
         */
-       if ((vmcb12->save.efer & EFER_SVME) == 0)
+       if (CC(!(save->efer & EFER_SVME)))
                return false;
 
-       if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+       if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+           CC(save->cr0 & ~0xffffffffULL))
                return false;
 
-       if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+       if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
                return false;
 
-       vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+       if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+               return false;
 
-       if (vmcb12_lma) {
-               if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
-                   !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                   kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
-                       return false;
-       }
-       if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+       if (CC(!kvm_valid_efer(vcpu, save->efer)))
                return false;
 
        return true;
 }
 
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
-                                    struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                           struct vmcb_control_area *control)
 {
        copy_vmcb_control_area(&svm->nested.ctl, control);
 
@@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 
 /*
  * Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
  */
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 {
        u32 mask;
        svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
@@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
  * Transfer any event that L0 or L1 wanted to inject into L2 to
  * EXIT_INT_INFO.
  */
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
-                                          struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+                                               struct vmcb *vmcb12)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
        u32 exit_int_info = 0;
@@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
                               bool nested_npt)
 {
-       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
                return -EINVAL;
 
        if (!nested_npt && is_pae_paging(vcpu) &&
            (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
-               if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
                        return -EINVAL;
        }
 
@@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
        return 0;
 }
 
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 {
+       if (!svm->nested.vmcb02.ptr)
+               return;
+
+       /* FIXME: merge g_pat from vmcb01 and vmcb12.  */
+       svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+       bool new_vmcb12 = false;
+
+       nested_vmcb02_compute_g_pat(svm);
+
        /* Load the nested guest state */
-       svm->vmcb->save.es = vmcb12->save.es;
-       svm->vmcb->save.cs = vmcb12->save.cs;
-       svm->vmcb->save.ss = vmcb12->save.ss;
-       svm->vmcb->save.ds = vmcb12->save.ds;
-       svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-       svm->vmcb->save.idtr = vmcb12->save.idtr;
+       if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+               new_vmcb12 = true;
+               svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+               svm->vmcb->save.es = vmcb12->save.es;
+               svm->vmcb->save.cs = vmcb12->save.cs;
+               svm->vmcb->save.ss = vmcb12->save.ss;
+               svm->vmcb->save.ds = vmcb12->save.ds;
+               svm->vmcb->save.cpl = vmcb12->save.cpl;
+               vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+               svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+               svm->vmcb->save.idtr = vmcb12->save.idtr;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+       }
+
        kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
 
        /*
@@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
 
        svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
        svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+       svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
        kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
        kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
        kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
        svm->vmcb->save.rax = vmcb12->save.rax;
        svm->vmcb->save.rsp = vmcb12->save.rsp;
        svm->vmcb->save.rip = vmcb12->save.rip;
-       svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-       svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+       /* These bits will be set properly on the first execution when new_vmcb12 is true */
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+               svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+               svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+       }
 }
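nested_vmcb02_prepare_save() copies a field group from vmcb12 only when the vmcb12 is new or its VMCB clean bit reports the group as dirty. A stripped-down sketch of that clean-bit gating (field groups and names invented for illustration):

#include <stdbool.h>
#include <stdint.h>

/* Invented field groups and clean bits, standing in for VMCB_SEG, VMCB_DT... */
enum { SKETCH_SEG = 1u << 0, SKETCH_DT = 1u << 1 };

struct vmcb_sketch {
	uint32_t clean;		/* set bit == group unchanged since last run */
	uint64_t seg_state;
	uint64_t dt_state;
};

static bool group_is_dirty(const struct vmcb_sketch *v, uint32_t bit)
{
	return !(v->clean & bit);
}

/* Copy a group only when the source VMCB is new or says the group changed,
 * mirroring the new_vmcb12 / vmcb_is_dirty() gating above. */
static void prepare_save(struct vmcb_sketch *dst,
			 const struct vmcb_sketch *src, bool new_vmcb12)
{
	if (new_vmcb12 || group_is_dirty(src, SKETCH_SEG))
		dst->seg_state = src->seg_state;

	if (new_vmcb12 || group_is_dirty(src, SKETCH_DT))
		dst->dt_state = src->dt_state;
}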
 
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
        const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 
+       /*
+        * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+        * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+        */
+
+       /*
+        * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+        * avic_physical_id.
+        */
+       WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+       /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
+       svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+       svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+       svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+       /* Done at vmrun: asid.  */
+
+       /* Also overwritten later if necessary.  */
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+       /* nested_cr3.  */
        if (nested_npt_enabled(svm))
                nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 
        svm->vmcb->control.int_ctl             =
                (svm->nested.ctl.int_ctl & ~mask) |
-               (svm->nested.hsave->control.int_ctl & mask);
+               (svm->vmcb01.ptr->control.int_ctl & mask);
 
        svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
        svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
@@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
        enter_guest_mode(&svm->vcpu);
 
        /*
-        * Merge guest and host intercepts - must be called  with vcpu in
-        * guest-mode to take affect here
+        * Merge guest and host intercepts - must be called with vcpu in
+        * guest-mode to take effect.
         */
        recalc_intercepts(svm);
+}
 
-       vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+       /*
+        * Some VMCB state is shared between L1 and L2 and thus has to be
+        * moved at the time of nested vmrun and vmexit.
+        *
+        * VMLOAD/VMSAVE state would also belong in this category, but KVM
+        * always performs VMLOAD and VMSAVE from the VMCB01.
+        */
+       to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
 }
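The hsave-based save/restore is replaced by keeping L1 state in vmcb01 and L2 state in vmcb02 and switching between them, with truly shared state copied across by nested_svm_copy_common_state(). A rough sketch of that two-VMCB arrangement (all names invented; the real switch is svm_switch_vmcb() in enter_svm_guest_mode() and nested_svm_vmexit()):

#include <stdint.h>

/* Invented two-VMCB arrangement: vmcb01 carries L1 state, vmcb02 carries L2
 * state, and 'active' points at whichever one should be run next. */
struct vmcb_sketch2 { uint64_t spec_ctrl; uint64_t rip; };

struct vcpu_sketch {
	struct vmcb_sketch2 vmcb01, vmcb02;
	struct vmcb_sketch2 *active;
};

/* State shared between L1 and L2 travels with the switch, in the spirit of
 * nested_svm_copy_common_state(). */
static void copy_common_state(const struct vmcb_sketch2 *from,
			      struct vmcb_sketch2 *to)
{
	to->spec_ctrl = from->spec_ctrl;
}

static void switch_to_vmcb02(struct vcpu_sketch *v)	/* nested VMRUN */
{
	copy_common_state(&v->vmcb01, &v->vmcb02);
	v->active = &v->vmcb02;
}

static void switch_to_vmcb01(struct vcpu_sketch *v)	/* nested #VMEXIT */
{
	copy_common_state(&v->vmcb02, &v->vmcb01);
	v->active = &v->vmcb01;
}

Keeping L1's state resident in vmcb01 avoids the wholesale copy into a separate hsave area on every nested transition; only the genuinely shared fields move.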
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
                         struct vmcb *vmcb12)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
 
        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
 
 
        svm->nested.vmcb12_gpa = vmcb12_gpa;
-       nested_prepare_vmcb_control(svm);
-       nested_prepare_vmcb_save(svm, vmcb12);
+
+       WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+       nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+       nested_vmcb02_prepare_control(svm);
+       nested_vmcb02_prepare_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
@@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
                return ret;
 
        if (!npt_enabled)
-               svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+               vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
 
        svm_set_gif(svm, true);
 
        return 0;
 }
 
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
        u64 vmcb12_gpa;
 
-       if (is_smm(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       ++vcpu->stat.nested_run;
+
+       if (is_smm(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
        vmcb12_gpa = svm->vmcb->save.rax;
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
        if (ret == -EINVAL) {
-               kvm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        } else if (ret) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
        vmcb12 = map.hva;
 
        if (WARN_ON_ONCE(!svm->nested.initialized))
                return -EINVAL;
 
-       load_nested_vmcb_control(svm, &vmcb12->control);
+       nested_load_control_from_vmcb12(svm, &vmcb12->control);
 
-       if (!nested_vmcb_check_save(svm, vmcb12) ||
-           !nested_vmcb_check_controls(&svm->nested.ctl)) {
+       if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+           !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
                vmcb12->control.exit_code    = SVM_EXIT_ERR;
                vmcb12->control.exit_code_hi = 0;
                vmcb12->control.exit_info_1  = 0;
@@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
 
        /* Clear internal status */
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        /*
-        * Save the old vmcb, so we don't need to pick what we save, but can
-        * restore everything when a VMEXIT occurs
+        * Since vmcb01 is not in use, we can use it to store some of the L1
+        * state.
         */
-       hsave->save.es     = vmcb->save.es;
-       hsave->save.cs     = vmcb->save.cs;
-       hsave->save.ss     = vmcb->save.ss;
-       hsave->save.ds     = vmcb->save.ds;
-       hsave->save.gdtr   = vmcb->save.gdtr;
-       hsave->save.idtr   = vmcb->save.idtr;
-       hsave->save.efer   = svm->vcpu.arch.efer;
-       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       hsave->save.cr4    = svm->vcpu.arch.cr4;
-       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
-       hsave->save.rsp    = vmcb->save.rsp;
-       hsave->save.rax    = vmcb->save.rax;
-       if (npt_enabled)
-               hsave->save.cr3    = vmcb->save.cr3;
-       else
-               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
-
-       copy_vmcb_control_area(&hsave->control, &vmcb->control);
+       svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
+       svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
+       svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
+       svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+       svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);
+
+       if (!npt_enabled)
+               svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
 
        svm->nested.nested_run_pending = 1;
 
-       if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+       if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
                goto out_exit_err;
 
        if (nested_svm_vmrun_msrpm(svm))
@@ -587,7 +667,7 @@ out_exit_err:
        nested_svm_vmexit(svm);
 
 out:
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
@@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-       int rc;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
+       int rc;
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+       /* Triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
+       rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        vmcb12 = map.hva;
 
        /* Exit Guest-Mode */
-       leave_guest_mode(&svm->vcpu);
+       leave_guest_mode(vcpu);
        svm->nested.vmcb12_gpa = 0;
        WARN_ON_ONCE(svm->nested.nested_run_pending);
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* in case we halted in L2 */
        svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->save.gdtr   = vmcb->save.gdtr;
        vmcb12->save.idtr   = vmcb->save.idtr;
        vmcb12->save.efer   = svm->vcpu.arch.efer;
-       vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       vmcb12->save.cr0    = kvm_read_cr0(vcpu);
+       vmcb12->save.cr3    = kvm_read_cr3(vcpu);
        vmcb12->save.cr2    = vmcb->save.cr2;
        vmcb12->save.cr4    = svm->vcpu.arch.cr4;
-       vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
-       vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
-       vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
-       vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
+       vmcb12->save.rflags = kvm_get_rflags(vcpu);
+       vmcb12->save.rip    = kvm_rip_read(vcpu);
+       vmcb12->save.rsp    = kvm_rsp_read(vcpu);
+       vmcb12->save.rax    = kvm_rax_read(vcpu);
        vmcb12->save.dr7    = vmcb->save.dr7;
        vmcb12->save.dr6    = svm->vcpu.arch.dr6;
        vmcb12->save.cpl    = vmcb->save.cpl;
@@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
 
        if (vmcb12->control.exit_code != SVM_EXIT_ERR)
-               nested_vmcb_save_pending_event(svm, vmcb12);
+               nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
        if (svm->nrips_enabled)
                vmcb12->control.next_rip  = vmcb->control.next_rip;
@@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.pause_filter_thresh =
                svm->vmcb->control.pause_filter_thresh;
 
-       /* Restore the original control entries */
-       copy_vmcb_control_area(&vmcb->control, &hsave->control);
+       nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
 
-       /* On vmexit the  GIF is set to false */
+       /*
+        * On vmexit the GIF is set to false and no event can be injected
+        * in L1.
+        */
        svm_set_gif(svm, false);
+       svm->vmcb->control.exit_int_info = 0;
 
-       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
-               svm->vcpu.arch.l1_tsc_offset;
+       svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+       if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+               svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       }
 
        svm->nested.ctl.nested_cr3 = 0;
 
-       /* Restore selected save entries */
-       svm->vmcb->save.es = hsave->save.es;
-       svm->vmcb->save.cs = hsave->save.cs;
-       svm->vmcb->save.ss = hsave->save.ss;
-       svm->vmcb->save.ds = hsave->save.ds;
-       svm->vmcb->save.gdtr = hsave->save.gdtr;
-       svm->vmcb->save.idtr = hsave->save.idtr;
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, hsave->save.efer);
-       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
-       kvm_rax_write(&svm->vcpu, hsave->save.rax);
-       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
-       kvm_rip_write(&svm->vcpu, hsave->save.rip);
-       svm->vmcb->save.dr7 = DR7_FIXED_1;
-       svm->vmcb->save.cpl = 0;
-       svm->vmcb->control.exit_int_info = 0;
+       /*
+        * Restore processor state that had been saved in vmcb01
+        */
+       kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+       svm_set_efer(vcpu, svm->vmcb->save.efer);
+       svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+       kvm_rax_write(vcpu, svm->vmcb->save.rax);
+       kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+       kvm_rip_write(vcpu, svm->vmcb->save.rip);
 
-       vmcb_mark_all_dirty(svm->vmcb);
+       svm->vcpu.arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(&svm->vcpu);
 
        trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
                                       vmcb12->control.exit_info_1,
@@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb12->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
-       nested_svm_uninit_mmu_context(&svm->vcpu);
+       nested_svm_uninit_mmu_context(vcpu);
 
-       rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+       rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
        if (rc)
                return 1;
 
-       if (npt_enabled)
-               svm->vmcb->save.cr3 = hsave->save.cr3;
-
        /*
         * Drop what we picked up for L2 via svm_complete_interrupts() so it
         * doesn't end up in L1.
         */
        svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
+
+       /*
+        * If we are here following the completion of a VMRUN that
+        * is being single-stepped, queue the pending #DB intercept
+        * right now so that it can be accounted for before we execute
+        * L1's next instruction.
+        */
+       if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+               kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
 
        return 0;
 }
 
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
 int svm_allocate_nested(struct vcpu_svm *svm)
 {
-       struct page *hsave_page;
+       struct page *vmcb02_page;
 
        if (svm->nested.initialized)
                return 0;
 
-       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!hsave_page)
+       vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb02_page)
                return -ENOMEM;
-       svm->nested.hsave = page_address(hsave_page);
+       svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+       svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
 
        svm->nested.msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->nested.msrpm)
-               goto err_free_hsave;
+               goto err_free_vmcb02;
        svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
        svm->nested.initialized = true;
        return 0;
 
-err_free_hsave:
-       __free_page(hsave_page);
+err_free_vmcb02:
+       __free_page(vmcb02_page);
        return -ENOMEM;
 }
 
@@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm)
        svm_vcpu_free_msrpm(svm->nested.msrpm);
        svm->nested.msrpm = NULL;
 
-       __free_page(virt_to_page(svm->nested.hsave));
-       svm->nested.hsave = NULL;
+       __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+       svm->nested.vmcb02.ptr = NULL;
 
        svm->nested.initialized = false;
 }
@@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm)
  */
 void svm_leave_nested(struct vcpu_svm *svm)
 {
-       if (is_guest_mode(&svm->vcpu)) {
-               struct vmcb *hsave = svm->nested.hsave;
-               struct vmcb *vmcb = svm->vmcb;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
 
+       if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
-               leave_guest_mode(&svm->vcpu);
-               copy_vmcb_control_area(&vmcb->control, &hsave->control);
-               nested_svm_uninit_mmu_context(&svm->vcpu);
+               leave_guest_mode(vcpu);
+
+               svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+               nested_svm_uninit_mmu_context(vcpu);
                vmcb_mark_all_dirty(svm->vmcb);
        }
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 }
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
        return vmexit;
 }
 
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 {
-       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
-           !is_paging(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
-       if (svm->vmcb->save.cpl) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       if (to_svm(vcpu)->vmcb->save.cpl) {
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
        nested_svm_vmexit(svm);
 }
 
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
 }
 
-static void nested_svm_init(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_init(svm))
                        return 0;
-               nested_svm_init(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
                return 0;
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               /*
+                * Only a pending nested run can block a pending exception.
+                * Otherwise an injected NMI/interrupt should either be
+                * lost or delivered to the nested hypervisor in the EXITINTINFO
+                * vmcb field, while delivering the pending exception.
+                */
+               if (svm->nested.nested_run_pending)
                         return -EBUSY;
                if (!nested_exit_on_exception(svm))
                        return 0;
@@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_smi(svm))
                        return 0;
-               nested_svm_smi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
                return 0;
        }
 
@@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_nmi(svm))
                        return 0;
-               nested_svm_nmi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
                return 0;
        }
 
@@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_intr(svm))
                        return 0;
-               nested_svm_intr(svm);
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
                return 0;
        }
 
@@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 
-               if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
-                               excp_bits)
+               if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+                   excp_bits)
                        return NESTED_EXIT_HOST;
                else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
                         svm->vcpu.arch.apf.host_apf_flags)
@@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
        if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
                         sizeof(user_vmcb->control)))
                return -EFAULT;
-       if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+       if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
                         sizeof(user_vmcb->save)))
                return -EFAULT;
-
 out:
        return kvm_state.size;
 }
@@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state *kvm_state)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb __user *user_vmcb = (struct vmcb __user *)
                &user_kvm_nested_state->data.svm[0];
        struct vmcb_control_area *ctl;
@@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        ret  = -ENOMEM;
-       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
-       save = kzalloc(sizeof(*save), GFP_KERNEL);
+       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL_ACCOUNT);
+       save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
        if (!ctl || !save)
                goto out_free;
 
@@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                goto out_free;
 
        ret = -EINVAL;
-       if (!nested_vmcb_check_controls(ctl))
+       if (!nested_vmcb_check_controls(vcpu, ctl))
                goto out_free;
 
        /*
         * Processor state contains L2 state.  Check that it is
-        * valid for guest mode (see nested_vmcb_checks).
+        * valid for guest mode (see nested_vmcb_check_save).
         */
        cr0 = kvm_read_cr0(vcpu);
         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        /*
         * Validate host state saved from before VMRUN (see
         * nested_svm_check_permissions).
-        * TODO: validate reserved bits for all saved state.
         */
-       if (!(save->cr0 & X86_CR0_PG))
-               goto out_free;
-       if (!(save->efer & EFER_SVME))
+       if (!(save->cr0 & X86_CR0_PG) ||
+           !(save->cr0 & X86_CR0_PE) ||
+           (save->rflags & X86_EFLAGS_VM) ||
+           !nested_vmcb_valid_sregs(vcpu, save))
                goto out_free;
 
        /*
-        * All checks done, we can enter guest mode.  L1 control fields
-        * come from the nested save state.  Guest state is already
-        * in the registers, the save area of the nested state instead
-        * contains saved L1 state.
+        * All checks done, we can enter guest mode. Userspace provides
+        * vmcb12.control, which will be combined with L1 and stored into
+        * vmcb02, and the L1 save state, which we store in vmcb01.
+        * L2 registers, if needed, are moved from the current VMCB to vmcb02.
         */
 
        svm->nested.nested_run_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
-       copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
-       hsave->save = *save;
-
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
-       load_nested_vmcb_control(svm, ctl);
-       nested_prepare_vmcb_control(svm);
+       if (svm->current_vmcb == &svm->vmcb01)
+               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+       svm->vmcb01.ptr->save.es = save->es;
+       svm->vmcb01.ptr->save.cs = save->cs;
+       svm->vmcb01.ptr->save.ss = save->ss;
+       svm->vmcb01.ptr->save.ds = save->ds;
+       svm->vmcb01.ptr->save.gdtr = save->gdtr;
+       svm->vmcb01.ptr->save.idtr = save->idtr;
+       svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+       svm->vmcb01.ptr->save.efer = save->efer;
+       svm->vmcb01.ptr->save.cr0 = save->cr0;
+       svm->vmcb01.ptr->save.cr3 = save->cr3;
+       svm->vmcb01.ptr->save.cr4 = save->cr4;
+       svm->vmcb01.ptr->save.rax = save->rax;
+       svm->vmcb01.ptr->save.rsp = save->rsp;
+       svm->vmcb01.ptr->save.rip = save->rip;
+       svm->vmcb01.ptr->save.cpl = 0;
+
+       nested_load_control_from_vmcb12(svm, ctl);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+       nested_vmcb02_prepare_control(svm);
 
        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        ret = 0;
@@ -1254,8 +1336,31 @@ out_free:
        return ret;
 }
 
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (WARN_ON(!is_guest_mode(vcpu)))
+               return true;
+
+       if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+                               nested_npt_enabled(svm)))
+               return false;
+
+       if (!nested_svm_vmrun_msrpm(svm)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return false;
+       }
+
+       return true;
+}
+
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,
index 214eefb..2632852 100644 (file)
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -50,6 +50,7 @@ static DECLARE_RWSEM(sev_deactivate_lock);
 static DEFINE_MUTEX(sev_bitmap_lock);
 unsigned int max_sev_asid;
 static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
 static unsigned long *sev_asid_bitmap;
 static unsigned long *sev_reclaim_asid_bitmap;
 
@@ -82,6 +83,11 @@ static int sev_flush_asids(void)
        return ret;
 }
 
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+       return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
 /* Must be called with the sev_bitmap_lock held */
 static bool __sev_recycle_asids(int min_asid, int max_asid)
 {
@@ -184,49 +190,41 @@ static void sev_asid_free(struct kvm_sev_info *sev)
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
 {
-       struct sev_data_decommission *decommission;
-       struct sev_data_deactivate *data;
+       struct sev_data_decommission decommission;
+       struct sev_data_deactivate deactivate;
 
        if (!handle)
                return;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return;
-
-       /* deactivate handle */
-       data->handle = handle;
+       deactivate.handle = handle;
 
        /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
        down_read(&sev_deactivate_lock);
-       sev_guest_deactivate(data, NULL);
+       sev_guest_deactivate(&deactivate, NULL);
        up_read(&sev_deactivate_lock);
 
-       kfree(data);
-
-       decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
-       if (!decommission)
-               return;
-
        /* decommission handle */
-       decommission->handle = handle;
-       sev_guest_decommission(decommission, NULL);
-
-       kfree(decommission);
+       decommission.handle = handle;
+       sev_guest_decommission(&decommission, NULL);
 }
 
 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       bool es_active = argp->id == KVM_SEV_ES_INIT;
        int asid, ret;
 
+       if (kvm->created_vcpus)
+               return -EINVAL;
+
        ret = -EBUSY;
        if (unlikely(sev->active))
                return ret;
 
+       sev->es_active = es_active;
        asid = sev_asid_new(sev);
        if (asid < 0)
-               return ret;
+               goto e_no_asid;
        sev->asid = asid;
 
        ret = sev_platform_init(&argp->error);
@@ -234,6 +232,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
                goto e_free;
 
        sev->active = true;
+       sev->asid = asid;
        INIT_LIST_HEAD(&sev->regions_list);
 
        return 0;
@@ -241,34 +240,21 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 e_free:
        sev_asid_free(sev);
        sev->asid = 0;
+e_no_asid:
+       sev->es_active = false;
        return ret;
 }
 
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       if (!sev_es)
-               return -ENOTTY;
-
-       to_kvm_svm(kvm)->sev_info.es_active = true;
-
-       return sev_guest_init(kvm, argp);
-}
-
 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
 {
-       struct sev_data_activate *data;
+       struct sev_data_activate activate;
        int asid = sev_get_asid(kvm);
        int ret;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
        /* activate ASID on the given handle */
-       data->handle = handle;
-       data->asid   = asid;
-       ret = sev_guest_activate(data, error);
-       kfree(data);
+       activate.handle = handle;
+       activate.asid   = asid;
+       ret = sev_guest_activate(&activate, error);
 
        return ret;
 }
@@ -298,7 +284,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_start *start;
+       struct sev_data_launch_start start;
        struct kvm_sev_launch_start params;
        void *dh_blob, *session_blob;
        int *error = &argp->error;
@@ -310,20 +296,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
-       if (!start)
-               return -ENOMEM;
+       memset(&start, 0, sizeof(start));
 
        dh_blob = NULL;
        if (params.dh_uaddr) {
                dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
-               if (IS_ERR(dh_blob)) {
-                       ret = PTR_ERR(dh_blob);
-                       goto e_free;
-               }
+               if (IS_ERR(dh_blob))
+                       return PTR_ERR(dh_blob);
 
-               start->dh_cert_address = __sme_set(__pa(dh_blob));
-               start->dh_cert_len = params.dh_len;
+               start.dh_cert_address = __sme_set(__pa(dh_blob));
+               start.dh_cert_len = params.dh_len;
        }
 
        session_blob = NULL;
@@ -334,40 +316,38 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
                        goto e_free_dh;
                }
 
-               start->session_address = __sme_set(__pa(session_blob));
-               start->session_len = params.session_len;
+               start.session_address = __sme_set(__pa(session_blob));
+               start.session_len = params.session_len;
        }
 
-       start->handle = params.handle;
-       start->policy = params.policy;
+       start.handle = params.handle;
+       start.policy = params.policy;
 
        /* create memory encryption context */
-       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
        if (ret)
                goto e_free_session;
 
        /* Bind ASID to this guest */
-       ret = sev_bind_asid(kvm, start->handle, error);
+       ret = sev_bind_asid(kvm, start.handle, error);
        if (ret)
                goto e_free_session;
 
        /* return handle to userspace */
-       params.handle = start->handle;
+       params.handle = start.handle;
        if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
-               sev_unbind_asid(kvm, start->handle);
+               sev_unbind_asid(kvm, start.handle);
                ret = -EFAULT;
                goto e_free_session;
        }
 
-       sev->handle = start->handle;
+       sev->handle = start.handle;
        sev->fd = argp->sev_fd;
 
 e_free_session:
        kfree(session_blob);
 e_free_dh:
        kfree(dh_blob);
-e_free:
-       kfree(start);
        return ret;
 }
 
@@ -486,7 +466,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_launch_update_data params;
-       struct sev_data_launch_update_data *data;
+       struct sev_data_launch_update_data data;
        struct page **inpages;
        int ret;
 
@@ -496,20 +476,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
        vaddr = params.uaddr;
        size = params.len;
        vaddr_end = vaddr + size;
 
        /* Lock the user memory. */
        inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
-       if (IS_ERR(inpages)) {
-               ret = PTR_ERR(inpages);
-               goto e_free;
-       }
+       if (IS_ERR(inpages))
+               return PTR_ERR(inpages);
 
        /*
         * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
@@ -517,6 +491,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
         */
        sev_clflush_pages(inpages, npages);
 
+       data.reserved = 0;
+       data.handle = sev->handle;
+
        for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
                int offset, len;
 
@@ -531,10 +508,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
                len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
 
-               data->handle = sev->handle;
-               data->len = len;
-               data->address = __sme_page_pa(inpages[i]) + offset;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+               data.len = len;
+               data.address = __sme_page_pa(inpages[i]) + offset;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
                if (ret)
                        goto e_unpin;
 
@@ -550,8 +526,6 @@ e_unpin:
        }
        /* unlock the user pages */
        sev_unpin_memory(kvm, inpages, npages);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -603,23 +577,22 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_update_vmsa *vmsa;
+       struct sev_data_launch_update_vmsa vmsa;
+       struct kvm_vcpu *vcpu;
        int i, ret;
 
        if (!sev_es_guest(kvm))
                return -ENOTTY;
 
-       vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
-       if (!vmsa)
-               return -ENOMEM;
+       vmsa.reserved = 0;
 
-       for (i = 0; i < kvm->created_vcpus; i++) {
-               struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               struct vcpu_svm *svm = to_svm(vcpu);
 
                /* Perform some pre-encryption checks against the VMSA */
                ret = sev_es_sync_vmsa(svm);
                if (ret)
-                       goto e_free;
+                       return ret;
 
                /*
                 * The LAUNCH_UPDATE_VMSA command will perform in-place
@@ -629,27 +602,25 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
                 */
                clflush_cache_range(svm->vmsa, PAGE_SIZE);
 
-               vmsa->handle = sev->handle;
-               vmsa->address = __sme_pa(svm->vmsa);
-               vmsa->len = PAGE_SIZE;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+               vmsa.handle = sev->handle;
+               vmsa.address = __sme_pa(svm->vmsa);
+               vmsa.len = PAGE_SIZE;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
                                    &argp->error);
                if (ret)
-                       goto e_free;
+                       return ret;
 
                svm->vcpu.arch.guest_state_protected = true;
        }
 
-e_free:
-       kfree(vmsa);
-       return ret;
+       return 0;
 }
 
 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        void __user *measure = (void __user *)(uintptr_t)argp->data;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_measure *data;
+       struct sev_data_launch_measure data;
        struct kvm_sev_launch_measure params;
        void __user *p = NULL;
        void *blob = NULL;
@@ -661,9 +632,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, measure, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* User wants to query the blob length */
        if (!params.len)
@@ -671,23 +640,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        p = (void __user *)(uintptr_t)params.uaddr;
        if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
+               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
-                       goto e_free;
+                       return -ENOMEM;
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
+               data.address = __psp_pa(blob);
+               data.len = params.len;
        }
 
 cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
 
        /*
         * If we query the session length, FW responded with expected data.
@@ -704,63 +670,50 @@ cmd:
        }
 
 done:
-       params.len = data->len;
+       params.len = data.len;
        if (copy_to_user(measure, &params, sizeof(params)))
                ret = -EFAULT;
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_finish *data;
-       int ret;
+       struct sev_data_launch_finish data;
 
        if (!sev_guest(kvm))
                return -ENOTTY;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
-       kfree(data);
-       return ret;
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
 }
 
 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_guest_status params;
-       struct sev_data_guest_status *data;
+       struct sev_data_guest_status data;
        int ret;
 
        if (!sev_guest(kvm))
                return -ENOTTY;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
        if (ret)
-               goto e_free;
+               return ret;
 
-       params.policy = data->policy;
-       params.state = data->state;
-       params.handle = data->handle;
+       params.policy = data.policy;
+       params.state = data.state;
+       params.handle = data.handle;
 
        if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
                ret = -EFAULT;
-e_free:
-       kfree(data);
+
        return ret;
 }
 
@@ -769,23 +722,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
                               int *error, bool enc)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_dbg *data;
-       int ret;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       struct sev_data_dbg data;
 
-       data->handle = sev->handle;
-       data->dst_addr = dst;
-       data->src_addr = src;
-       data->len = size;
+       data.reserved = 0;
+       data.handle = sev->handle;
+       data.dst_addr = dst;
+       data.src_addr = src;
+       data.len = size;
 
-       ret = sev_issue_cmd(kvm,
-                           enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
-                           data, error);
-       kfree(data);
-       return ret;
+       return sev_issue_cmd(kvm,
+                            enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+                            &data, error);
 }
 
 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
@@ -1005,7 +952,7 @@ err:
 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_secret *data;
+       struct sev_data_launch_secret data;
        struct kvm_sev_launch_secret params;
        struct page **pages;
        void *blob, *hdr;
@@ -1037,41 +984,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
                goto e_unpin_memory;
        }
 
-       ret = -ENOMEM;
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               goto e_unpin_memory;
+       memset(&data, 0, sizeof(data));
 
        offset = params.guest_uaddr & (PAGE_SIZE - 1);
-       data->guest_address = __sme_page_pa(pages[0]) + offset;
-       data->guest_len = params.guest_len;
+       data.guest_address = __sme_page_pa(pages[0]) + offset;
+       data.guest_len = params.guest_len;
 
        blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
        if (IS_ERR(blob)) {
                ret = PTR_ERR(blob);
-               goto e_free;
+               goto e_unpin_memory;
        }
 
-       data->trans_address = __psp_pa(blob);
-       data->trans_len = params.trans_len;
+       data.trans_address = __psp_pa(blob);
+       data.trans_len = params.trans_len;
 
        hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
        if (IS_ERR(hdr)) {
                ret = PTR_ERR(hdr);
                goto e_free_blob;
        }
-       data->hdr_address = __psp_pa(hdr);
-       data->hdr_len = params.hdr_len;
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
 
        kfree(hdr);
 
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
 e_unpin_memory:
        /* content of memory is updated, mark pages dirty */
        for (i = 0; i < n; i++) {
@@ -1086,7 +1028,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        void __user *report = (void __user *)(uintptr_t)argp->data;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_attestation_report *data;
+       struct sev_data_attestation_report data;
        struct kvm_sev_attestation_report params;
        void __user *p;
        void *blob = NULL;
@@ -1098,9 +1040,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* User wants to query the blob length */
        if (!params.len)
@@ -1108,23 +1048,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        p = (void __user *)(uintptr_t)params.uaddr;
        if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
+               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
-                       goto e_free;
+                       return -ENOMEM;
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
-               memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+               data.address = __psp_pa(blob);
+               data.len = params.len;
+               memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
        }
 cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
        /*
         * If we query the session length, FW responded with expected data.
         */
@@ -1140,16 +1077,411 @@ cmd:
        }
 
 done:
-       params.len = data->len;
+       params.len = data.len;
        if (copy_to_user(report, &params, sizeof(params)))
                ret = -EFAULT;
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+                                     struct kvm_sev_send_start *params)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_start data;
+       int ret;
+
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+       if (ret < 0)
+               return ret;
+
+       params->session_len = data.session_len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+                               sizeof(struct kvm_sev_send_start)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_start data;
+       struct kvm_sev_send_start params;
+       void *amd_certs, *session_data;
+       void *pdh_cert, *plat_certs;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                               sizeof(struct kvm_sev_send_start)))
+               return -EFAULT;
+
+       /* if session_len is zero, userspace wants to query the session length */
+       if (!params.session_len)
+               return __sev_send_start_query_session_length(kvm, argp,
+                               &params);
+
+       /* some sanity checks */
+       if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+           !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EINVAL;
+
+       /* allocate the memory to hold the session data blob */
+       session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+       if (!session_data)
+               return -ENOMEM;
+
+       /* copy the certificate blobs from userspace */
+       pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+                               params.pdh_cert_len);
+       if (IS_ERR(pdh_cert)) {
+               ret = PTR_ERR(pdh_cert);
+               goto e_free_session;
+       }
+
+       plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+                               params.plat_certs_len);
+       if (IS_ERR(plat_certs)) {
+               ret = PTR_ERR(plat_certs);
+               goto e_free_pdh;
+       }
+
+       amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+                               params.amd_certs_len);
+       if (IS_ERR(amd_certs)) {
+               ret = PTR_ERR(amd_certs);
+               goto e_free_plat_cert;
+       }
+
+       /* populate the FW SEND_START field with system physical address */
+       memset(&data, 0, sizeof(data));
+       data.pdh_cert_address = __psp_pa(pdh_cert);
+       data.pdh_cert_len = params.pdh_cert_len;
+       data.plat_certs_address = __psp_pa(plat_certs);
+       data.plat_certs_len = params.plat_certs_len;
+       data.amd_certs_address = __psp_pa(amd_certs);
+       data.amd_certs_len = params.amd_certs_len;
+       data.session_address = __psp_pa(session_data);
+       data.session_len = params.session_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+       if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+                       session_data, params.session_len)) {
+               ret = -EFAULT;
+               goto e_free_amd_cert;
+       }
+
+       params.policy = data.policy;
+       params.session_len = data.session_len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+                               sizeof(struct kvm_sev_send_start)))
+               ret = -EFAULT;
+
+e_free_amd_cert:
+       kfree(amd_certs);
+e_free_plat_cert:
+       kfree(plat_certs);
+e_free_pdh:
+       kfree(pdh_cert);
+e_free_session:
+       kfree(session_data);
+       return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+                                    struct kvm_sev_send_update_data *params)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_update_data data;
+       int ret;
+
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+       if (ret < 0)
+               return ret;
+
+       params->hdr_len = data.hdr_len;
+       params->trans_len = data.trans_len;
+
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+                        sizeof(struct kvm_sev_send_update_data)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_update_data data;
+       struct kvm_sev_send_update_data params;
+       void *hdr, *trans_data;
+       struct page **guest_page;
+       unsigned long n;
+       int ret, offset;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_send_update_data)))
+               return -EFAULT;
+
+       /* userspace wants to query either header or trans length */
+       if (!params.trans_len || !params.hdr_len)
+               return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+       if (!params.trans_uaddr || !params.guest_uaddr ||
+           !params.guest_len || !params.hdr_uaddr)
+               return -EINVAL;
+
+       /* Check if we are crossing the page boundary */
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       if ((params.guest_len + offset > PAGE_SIZE))
+               return -EINVAL;
+
+       /* Pin guest memory */
+       guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+                                   PAGE_SIZE, &n, 0);
+       if (!guest_page)
+               return -EFAULT;
+
+       /* allocate memory for header and transport buffer */
+       ret = -ENOMEM;
+       hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+       if (!hdr)
+               goto e_unpin;
+
+       trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+       if (!trans_data)
+               goto e_free_hdr;
+
+       memset(&data, 0, sizeof(data));
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
+       data.trans_address = __psp_pa(trans_data);
+       data.trans_len = params.trans_len;
+
+       /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+       data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+       data.guest_address |= sev_me_mask;
+       data.guest_len = params.guest_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+       if (ret)
+               goto e_free_trans_data;
+
+       /* copy transport buffer to user space */
+       if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+                        trans_data, params.trans_len)) {
+               ret = -EFAULT;
+               goto e_free_trans_data;
+       }
+
+       /* Copy packet header to userspace. */
+       ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+                               params.hdr_len);
+
+e_free_trans_data:
+       kfree(trans_data);
+e_free_hdr:
+       kfree(hdr);
+e_unpin:
+       sev_unpin_memory(kvm, guest_page, n);
+
+       return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_finish data;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_cancel data;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_receive_start start;
+       struct kvm_sev_receive_start params;
+       int *error = &argp->error;
+       void *session_data;
+       void *pdh_data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       /* Get parameter from the userspace */
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_receive_start)))
+               return -EFAULT;
+
+       /* some sanity checks */
+       if (!params.pdh_uaddr || !params.pdh_len ||
+           !params.session_uaddr || !params.session_len)
+               return -EINVAL;
+
+       pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+       if (IS_ERR(pdh_data))
+               return PTR_ERR(pdh_data);
+
+       session_data = psp_copy_user_blob(params.session_uaddr,
+                       params.session_len);
+       if (IS_ERR(session_data)) {
+               ret = PTR_ERR(session_data);
+               goto e_free_pdh;
+       }
+
+       memset(&start, 0, sizeof(start));
+       start.handle = params.handle;
+       start.policy = params.policy;
+       start.pdh_cert_address = __psp_pa(pdh_data);
+       start.pdh_cert_len = params.pdh_len;
+       start.session_address = __psp_pa(session_data);
+       start.session_len = params.session_len;
+
+       /* create memory encryption context */
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+                               error);
+       if (ret)
+               goto e_free_session;
+
+       /* Bind ASID to this guest */
+       ret = sev_bind_asid(kvm, start.handle, error);
+       if (ret)
+               goto e_free_session;
+
+       params.handle = start.handle;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data,
+                        &params, sizeof(struct kvm_sev_receive_start))) {
+               ret = -EFAULT;
+               sev_unbind_asid(kvm, start.handle);
+               goto e_free_session;
+       }
+
+       sev->handle = start.handle;
+       sev->fd = argp->sev_fd;
+
+e_free_session:
+       kfree(session_data);
+e_free_pdh:
+       kfree(pdh_data);
+
+       return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct kvm_sev_receive_update_data params;
+       struct sev_data_receive_update_data data;
+       void *hdr = NULL, *trans = NULL;
+       struct page **guest_page;
+       unsigned long n;
+       int ret, offset;
+
+       if (!sev_guest(kvm))
+               return -EINVAL;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_receive_update_data)))
+               return -EFAULT;
+
+       if (!params.hdr_uaddr || !params.hdr_len ||
+           !params.guest_uaddr || !params.guest_len ||
+           !params.trans_uaddr || !params.trans_len)
+               return -EINVAL;
+
+       /* Check if we are crossing the page boundary */
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       if ((params.guest_len + offset > PAGE_SIZE))
+               return -EINVAL;
+
+       hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+       if (IS_ERR(hdr))
+               return PTR_ERR(hdr);
+
+       trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto e_free_hdr;
+       }
+
+       memset(&data, 0, sizeof(data));
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
+       data.trans_address = __psp_pa(trans);
+       data.trans_len = params.trans_len;
+
+       /* Pin guest memory */
+       ret = -EFAULT;
+       guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+                                   PAGE_SIZE, &n, 0);
+       if (!guest_page)
+               goto e_free_trans;
+
+       /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+       data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+       data.guest_address |= sev_me_mask;
+       data.guest_len = params.guest_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+                               &argp->error);
+
+       sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+       kfree(trans);
+e_free_hdr:
+       kfree(hdr);
+
+       return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_receive_finish data;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
        struct kvm_sev_cmd sev_cmd;
@@ -1166,13 +1498,22 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 
        mutex_lock(&kvm->lock);
 
+       /* enc_context_owner handles all memory enc operations */
+       if (is_mirroring_enc_context(kvm)) {
+               r = -EINVAL;
+               goto out;
+       }
+
        switch (sev_cmd.id) {
+       case KVM_SEV_ES_INIT:
+               if (!sev_es) {
+                       r = -ENOTTY;
+                       goto out;
+               }
+               fallthrough;
        case KVM_SEV_INIT:
                r = sev_guest_init(kvm, &sev_cmd);
                break;
-       case KVM_SEV_ES_INIT:
-               r = sev_es_guest_init(kvm, &sev_cmd);
-               break;
        case KVM_SEV_LAUNCH_START:
                r = sev_launch_start(kvm, &sev_cmd);
                break;
@@ -1203,6 +1544,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
        case KVM_SEV_GET_ATTESTATION_REPORT:
                r = sev_get_attestation_report(kvm, &sev_cmd);
                break;
+       case KVM_SEV_SEND_START:
+               r = sev_send_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_UPDATE_DATA:
+               r = sev_send_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_FINISH:
+               r = sev_send_finish(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_CANCEL:
+               r = sev_send_cancel(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_START:
+               r = sev_receive_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_UPDATE_DATA:
+               r = sev_receive_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_FINISH:
+               r = sev_receive_finish(kvm, &sev_cmd);
+               break;
        default:
                r = -EINVAL;
                goto out;
@@ -1226,6 +1588,10 @@ int svm_register_enc_region(struct kvm *kvm,
        if (!sev_guest(kvm))
                return -ENOTTY;
 
+       /* If kvm is mirroring the encryption context, it isn't responsible for it */
+       if (is_mirroring_enc_context(kvm))
+               return -EINVAL;
+
        if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
                return -EINVAL;
 
@@ -1292,6 +1658,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
        struct enc_region *region;
        int ret;
 
+       /* If kvm is mirroring the encryption context, it isn't responsible for it */
+       if (is_mirroring_enc_context(kvm))
+               return -EINVAL;
+
        mutex_lock(&kvm->lock);
 
        if (!sev_guest(kvm)) {
@@ -1322,6 +1692,71 @@ failed:
        return ret;
 }
 
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+       struct file *source_kvm_file;
+       struct kvm *source_kvm;
+       struct kvm_sev_info *mirror_sev;
+       unsigned int asid;
+       int ret;
+
+       source_kvm_file = fget(source_fd);
+       if (!file_is_kvm(source_kvm_file)) {
+               ret = -EBADF;
+               goto e_source_put;
+       }
+
+       source_kvm = source_kvm_file->private_data;
+       mutex_lock(&source_kvm->lock);
+
+       if (!sev_guest(source_kvm)) {
+               ret = -EINVAL;
+               goto e_source_unlock;
+       }
+
+       /* Mirrors of mirrors should work, but let's not get silly */
+       if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+               ret = -EINVAL;
+               goto e_source_unlock;
+       }
+
+       asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+       /*
+        * The mirror kvm holds an enc_context_owner ref so its asid can't
+        * disappear until we're done with it
+        */
+       kvm_get_kvm(source_kvm);
+
+       fput(source_kvm_file);
+       mutex_unlock(&source_kvm->lock);
+       mutex_lock(&kvm->lock);
+
+       if (sev_guest(kvm)) {
+               ret = -EINVAL;
+               goto e_mirror_unlock;
+       }
+
+       /* Set enc_context_owner and copy its encryption context over */
+       mirror_sev = &to_kvm_svm(kvm)->sev_info;
+       mirror_sev->enc_context_owner = source_kvm;
+       mirror_sev->asid = asid;
+       mirror_sev->active = true;
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+
+e_mirror_unlock:
+       mutex_unlock(&kvm->lock);
+       kvm_put_kvm(source_kvm);
+       return ret;
+e_source_unlock:
+       mutex_unlock(&source_kvm->lock);
+e_source_put:
+       fput(source_kvm_file);
+       return ret;
+}
+
 void sev_vm_destroy(struct kvm *kvm)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1331,6 +1766,12 @@ void sev_vm_destroy(struct kvm *kvm)
        if (!sev_guest(kvm))
                return;
 
+       /* If this is a mirror_kvm, release the enc_context_owner and skip SEV cleanup */
+       if (is_mirroring_enc_context(kvm)) {
+               kvm_put_kvm(sev->enc_context_owner);
+               return;
+       }
+
        mutex_lock(&kvm->lock);
 
        /*
@@ -1382,6 +1823,7 @@ void __init sev_hardware_setup(void)
 
        /* Minimum ASID value that should be used for SEV guest */
        min_sev_asid = edx;
+       sev_me_mask = 1UL << (ebx & 0x3f);
 
        /* Initialize SEV ASID bitmaps */
        sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
@@ -1825,7 +2267,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
                               len, GHCB_SCRATCH_AREA_LIMIT);
                        return false;
                }
-               scratch_va = kzalloc(len, GFP_KERNEL);
+               scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
                if (!scratch_va)
                        return false;
 
@@ -1899,7 +2341,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
                vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
                vcpu->arch.regs[VCPU_REGS_RCX] = 0;
 
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
                if (!ret) {
                        ret = -EINVAL;
                        break;
@@ -1949,8 +2391,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
        return ret;
 }
 
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        u64 ghcb_gpa, exit_code;
        struct ghcb *ghcb;
@@ -1962,13 +2405,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                return sev_handle_vmgexit_msr_protocol(svm);
 
        if (!ghcb_gpa) {
-               vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+               vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
                return -EINVAL;
        }
 
-       if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+       if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
                /* Unable to map GHCB from guest */
-               vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+               vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
                            ghcb_gpa);
                return -EINVAL;
        }
@@ -1976,7 +2419,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
        svm->ghcb = svm->ghcb_map.hva;
        ghcb = svm->ghcb_map.hva;
 
-       trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+       trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
 
        exit_code = ghcb_get_sw_exit_code(ghcb);
 
@@ -1994,7 +2437,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_read(&svm->vcpu,
+               ret = kvm_sev_es_mmio_read(vcpu,
                                           control->exit_info_1,
                                           control->exit_info_2,
                                           svm->ghcb_sa);
@@ -2003,19 +2446,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_write(&svm->vcpu,
+               ret = kvm_sev_es_mmio_write(vcpu,
                                            control->exit_info_1,
                                            control->exit_info_2,
                                            svm->ghcb_sa);
                break;
        case SVM_VMGEXIT_NMI_COMPLETE:
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
                break;
        case SVM_VMGEXIT_AP_HLT_LOOP:
-               ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+               ret = kvm_emulate_ap_reset_hold(vcpu);
                break;
        case SVM_VMGEXIT_AP_JUMP_TABLE: {
-               struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+               struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
 
                switch (control->exit_info_1) {
                case 0:
@@ -2040,12 +2483,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                break;
        }
        case SVM_VMGEXIT_UNSUPPORTED_EVENT:
-               vcpu_unimpl(&svm->vcpu,
+               vcpu_unimpl(vcpu,
                            "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
                            control->exit_info_1, control->exit_info_2);
                break;
        default:
-               ret = svm_invoke_exit_handler(svm, exit_code);
+               ret = svm_invoke_exit_handler(vcpu, exit_code);
        }
 
        return ret;
@@ -2154,5 +2597,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
         * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
         * non-zero value.
         */
+       if (!svm->ghcb)
+               return;
+
        ghcb_set_sw_exit_info_2(svm->ghcb, 1);
 }
index 58a45bb..cd8c333 100644 (file)
@@ -56,9 +56,6 @@ static const struct x86_cpu_id svm_cpu_id[] = {
 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #endif
 
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
@@ -95,6 +92,8 @@ static const struct svm_direct_access_msrs {
 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
        { .index = MSR_STAR,                            .always = true  },
        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
+       { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
+       { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
 #ifdef CONFIG_X86_64
        { .index = MSR_GS_BASE,                         .always = true  },
        { .index = MSR_FS_BASE,                         .always = true  },
@@ -279,7 +278,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                         * In this case we will return to the nested guest
                         * as soon as we leave SMM.
                         */
-                       if (!is_smm(&svm->vcpu))
+                       if (!is_smm(vcpu))
                                svm_free_nested(svm);
 
                } else {
@@ -363,10 +362,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
        bool has_error_code = vcpu->arch.exception.has_error_code;
        u32 error_code = vcpu->arch.exception.error_code;
 
-       kvm_deliver_exception_payload(&svm->vcpu);
+       kvm_deliver_exception_payload(vcpu);
 
        if (nr == BP_VECTOR && !nrips) {
-               unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+               unsigned long rip, old_rip = kvm_rip_read(vcpu);
 
                /*
                 * For guest debugging where we have to reinject #BP if some
@@ -375,8 +374,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
                 * raises a fault that is not intercepted. Still better than
                 * failing in all cases.
                 */
-               (void)skip_emulated_instruction(&svm->vcpu);
-               rip = kvm_rip_read(&svm->vcpu);
+               (void)skip_emulated_instruction(vcpu);
+               rip = kvm_rip_read(vcpu);
                svm->int3_rip = rip + svm->vmcb->save.cs.base;
                svm->int3_injected = rip - old_rip;
        }
@@ -681,14 +680,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 
 u32 *svm_vcpu_alloc_msrpm(void)
 {
-       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+       unsigned int order = get_order(MSRPM_SIZE);
+       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
        u32 *msrpm;
 
        if (!pages)
                return NULL;
 
        msrpm = page_address(pages);
-       memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+       memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 
        return msrpm;
 }
@@ -707,7 +707,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 
 void svm_vcpu_free_msrpm(u32 *msrpm)
 {
-       __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+       __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 }
 
 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
@@ -881,7 +881,7 @@ static __init void svm_adjust_mmio_mask(void)
         */
        mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
 
-       kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
 }
 
 static void svm_hardware_teardown(void)
@@ -894,7 +894,8 @@ static void svm_hardware_teardown(void)
        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
 
-       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+       get_order(IOPM_SIZE));
        iopm_base = 0;
 }
 
@@ -930,14 +931,15 @@ static __init int svm_hardware_setup(void)
        struct page *iopm_pages;
        void *iopm_va;
        int r;
+       unsigned int order = get_order(IOPM_SIZE);
 
-       iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+       iopm_pages = alloc_pages(GFP_KERNEL, order);
 
        if (!iopm_pages)
                return -ENOMEM;
 
        iopm_va = page_address(iopm_pages);
-       memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
        init_msrpm_offsets();
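The IOPM/MSRPM hunks above replace the hard-coded IOPM_ALLOC_ORDER/MSRPM_ALLOC_ORDER constants with get_order() applied to the actual table sizes. get_order() rounds a byte count up to a power-of-two number of pages and returns the log2 of that page count; a minimal re-implementation (assuming 4 KiB pages, illustration only) behaves like this:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Same semantics as the kernel's get_order(), written out longhand. */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	/* e.g. an 8 KiB permission map needs order 1 (two pages), while a
	 * 12 KiB map rounds up to order 2 (four pages). */
	printf("get_order(8K)  = %u\n", get_order(2 * PAGE_SIZE));
	printf("get_order(12K) = %u\n", get_order(3 * PAGE_SIZE));
	return 0;
}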
@@ -1084,8 +1086,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        if (is_guest_mode(vcpu)) {
                /* Write L1's TSC offset.  */
                g_tsc_offset = svm->vmcb->control.tsc_offset -
-                              svm->nested.hsave->control.tsc_offset;
-               svm->nested.hsave->control.tsc_offset = offset;
+                              svm->vmcb01.ptr->control.tsc_offset;
+               svm->vmcb01.ptr->control.tsc_offset = offset;
        }
 
        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1113,12 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
        }
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       svm->vcpu.arch.hflags = 0;
+       vcpu->arch.hflags = 0;
 
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1129,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+       if (!kvm_vcpu_apicv_active(vcpu))
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 
        set_dr_intercepts(svm);
@@ -1170,12 +1173,12 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);
 
-       if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_mwait_in_guest(vcpu->kvm)) {
                svm_set_intercept(svm, INTERCEPT_MONITOR);
                svm_set_intercept(svm, INTERCEPT_MWAIT);
        }
 
-       if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+       if (!kvm_hlt_in_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_HLT);
 
        control->iopm_base_pa = __sme_set(iopm_base);
@@ -1201,19 +1204,19 @@ static void init_vmcb(struct vcpu_svm *svm)
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_cr4(&svm->vcpu, 0);
-       svm_set_efer(&svm->vcpu, 0);
+       svm_set_cr4(vcpu, 0);
+       svm_set_efer(vcpu, 0);
        save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        save->rip = 0x0000fff0;
-       svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
 
        /*
         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         * It also updates the guest-visible cr0 value.
         */
-       svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(&svm->vcpu);
+       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+       kvm_mmu_reset_context(vcpu);
 
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
@@ -1225,17 +1228,18 @@ static void init_vmcb(struct vcpu_svm *svm)
                clr_exception_intercept(svm, PF_VECTOR);
                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
-               save->g_pat = svm->vcpu.arch.pat;
+               save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
-       svm->asid_generation = 0;
+       svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
        svm->nested.vmcb12_gpa = 0;
-       svm->vcpu.arch.hflags = 0;
+       svm->nested.last_vmcb12_gpa = 0;
+       vcpu->arch.hflags = 0;
 
-       if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
@@ -1246,18 +1250,15 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm_check_invpcid(svm);
 
-       if (kvm_vcpu_apicv_active(&svm->vcpu))
-               avic_init_vmcb(svm);
-
        /*
-        * If hardware supports Virtual VMLOAD VMSAVE then enable it
-        * in VMCB and clear intercepts to avoid #VMEXIT.
+        * If the host supports V_SPEC_CTRL then disable the interception
+        * of MSR_IA32_SPEC_CTRL.
         */
-       if (vls) {
-               svm_clr_intercept(svm, INTERCEPT_VMLOAD);
-               svm_clr_intercept(svm, INTERCEPT_VMSAVE);
-               svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-       }
+       if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_init_vmcb(svm);
 
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1265,11 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm)
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }
 
-       if (sev_guest(svm->vcpu.kvm)) {
+       if (sev_guest(vcpu->kvm)) {
                svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
                clr_exception_intercept(svm, UD_VECTOR);
 
-               if (sev_es_guest(svm->vcpu.kvm)) {
+               if (sev_es_guest(vcpu->kvm)) {
                        /* Perform SEV-ES specific VMCB updates */
                        sev_es_init_vmcb(svm);
                }
@@ -1291,12 +1292,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        svm->virt_spec_ctrl = 0;
 
        if (!init_event) {
-               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                          MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                      MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(vcpu))
+                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        }
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
@@ -1305,10 +1306,16 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+       svm->current_vmcb = target_vmcb;
+       svm->vmcb = target_vmcb->ptr;
+}
+
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm;
-       struct page *vmcb_page;
+       struct page *vmcb01_page;
        struct page *vmsa_page = NULL;
        int err;
 
@@ -1316,11 +1323,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
        svm = to_svm(vcpu);
 
        err = -ENOMEM;
-       vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!vmcb_page)
+       vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb01_page)
                goto out;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests require a separate VMSA page used to contain
                 * the encrypted register state of the guest.
@@ -1356,20 +1363,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
        svm_vcpu_init_msrpm(vcpu, svm->msrpm);
 
-       svm->vmcb = page_address(vmcb_page);
-       svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+       svm->vmcb01.ptr = page_address(vmcb01_page);
+       svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
 
        if (vmsa_page)
                svm->vmsa = page_address(vmsa_page);
 
-       svm->asid_generation = 0;
        svm->guest_state_loaded = false;
-       init_vmcb(svm);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       init_vmcb(vcpu);
 
        svm_init_osvw(vcpu);
        vcpu->arch.microcode_version = 0x01000065;
 
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                /* Perform SEV-ES specific VMCB creation updates */
                sev_es_create_vcpu(svm);
 
@@ -1379,7 +1387,7 @@ error_free_vmsa_page:
        if (vmsa_page)
                __free_page(vmsa_page);
 error_free_vmcb_page:
-       __free_page(vmcb_page);
+       __free_page(vmcb01_page);
 out:
        return err;
 }
@@ -1407,8 +1415,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
        sev_free_vcpu(vcpu);
 
-       __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
-       __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+       __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+       __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -1432,7 +1440,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
         * Save additional host state that will be restored on VMEXIT (sev-es)
         * or subsequent vmload of host save area.
         */
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                sev_es_prepare_guest_switch(svm, vcpu->cpu);
        } else {
                vmsave(__sme_page_pa(sd->save_area));
@@ -1476,11 +1484,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-       if (unlikely(cpu != vcpu->cpu)) {
-               svm->asid_generation = 0;
-               vmcb_mark_all_dirty(svm->vmcb);
-       }
-
        if (sd->current_vmcb != svm->vmcb) {
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
@@ -1564,7 +1567,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= mask;
        if (is_guest_mode(&svm->vcpu)) {
-               svm->nested.hsave->control.int_ctl &= mask;
+               svm->vmcb01.ptr->control.int_ctl &= mask;
 
                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -1577,16 +1580,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+       struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
 
        switch (seg) {
        case VCPU_SREG_CS: return &save->cs;
        case VCPU_SREG_DS: return &save->ds;
        case VCPU_SREG_ES: return &save->es;
-       case VCPU_SREG_FS: return &save->fs;
-       case VCPU_SREG_GS: return &save->gs;
+       case VCPU_SREG_FS: return &save01->fs;
+       case VCPU_SREG_GS: return &save01->gs;
        case VCPU_SREG_SS: return &save->ss;
-       case VCPU_SREG_TR: return &save->tr;
-       case VCPU_SREG_LDTR: return &save->ldtr;
+       case VCPU_SREG_TR: return &save01->tr;
+       case VCPU_SREG_LDTR: return &save01->ldtr;
        }
        BUG();
        return NULL;
@@ -1709,37 +1713,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
-       ulong gcr0;
-       u64 *hcr0;
-
-       /*
-        * SEV-ES guests must always keep the CR intercepts cleared. CR
-        * tracking is done using the CR write traps.
-        */
-       if (sev_es_guest(svm->vcpu.kvm))
-               return;
-
-       gcr0 = svm->vcpu.arch.cr0;
-       hcr0 = &svm->vmcb->save.cr0;
-       *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-               | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
-       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
-       if (gcr0 == *hcr0) {
-               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
-               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
-       } else {
-               svm_set_intercept(svm, INTERCEPT_CR0_READ);
-               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
-       }
-}
-
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       u64 hcr0 = cr0;
 
 #ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1757,7 +1734,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vcpu->arch.cr0 = cr0;
 
        if (!npt_enabled)
-               cr0 |= X86_CR0_PG | X86_CR0_WP;
+               hcr0 |= X86_CR0_PG | X86_CR0_WP;
 
        /*
         * re-enable caching here because the QEMU bios
@@ -1765,10 +1742,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         * reboot
         */
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-               cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-       svm->vmcb->save.cr0 = cr0;
+               hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+       svm->vmcb->save.cr0 = hcr0;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-       update_cr0_intercept(svm);
+
+       /*
+        * SEV-ES guests must always keep the CR intercepts cleared. CR
+        * tracking is done using the CR write traps.
+        */
+       if (sev_es_guest(vcpu->kvm))
+               return;
+
+       if (hcr0 == cr0) {
+               /* Selective CR0 write remains on.  */
+               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+       } else {
+               svm_set_intercept(svm, INTERCEPT_CR0_READ);
+               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+       }
 }
 
 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
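The svm_set_cr0() rework above separates the guest-visible cr0 from the value written to the VMCB (hcr0): without NPT, KVM must force PG and WP in hardware, and the CD/NW quirk clears the caching bits, so full CR0 read/write interception is only needed while the two values differ (replacing the old update_cr0_intercept() helper). A standalone sketch of that decision:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_CR0_PG (1UL << 31)
#define X86_CR0_WP (1UL << 16)
#define X86_CR0_CD (1UL << 30)
#define X86_CR0_NW (1UL << 29)

int main(void)
{
	uint64_t cr0 = X86_CR0_CD | X86_CR0_NW;	/* what the guest wrote */
	uint64_t hcr0 = cr0;			/* what hardware will see */
	bool npt_enabled = false, cd_nw_quirk = true;

	if (!npt_enabled)
		hcr0 |= X86_CR0_PG | X86_CR0_WP;
	if (cd_nw_quirk)
		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	/* Intercept CR0 accesses only while guest and hardware views diverge. */
	printf("intercept CR0: %s\n", hcr0 == cr0 ? "no" : "yes");
	return 0;
}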
@@ -1847,7 +1840,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }
 
-       svm->asid_generation = sd->asid_generation;
+       svm->current_vmcb->asid_generation = sd->asid_generation;
        svm->asid = sd->next_asid++;
 }
 
@@ -1896,39 +1889,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
 }
 
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
 {
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
-       return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+       return kvm_handle_page_fault(vcpu, error_code, fault_address,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+
        u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
-       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+       return kvm_mmu_page_fault(vcpu, fault_address, error_code,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!(svm->vcpu.guest_debug &
+       if (!(vcpu->guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
-               kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+               kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
                return 1;
        }
 
@@ -1938,7 +1935,7 @@ static int db_interception(struct vcpu_svm *svm)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       if (svm->vcpu.guest_debug &
+       if (vcpu->guest_debug &
            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1949,10 @@ static int db_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
 
        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1960,14 @@ static int bp_interception(struct vcpu_svm *svm)
        return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
 {
-       return handle_ud(&svm->vcpu);
+       return handle_ud(vcpu);
 }
 
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+       kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
        return 1;
 }
 
@@ -2012,7 +2010,7 @@ static bool is_erratum_383(void)
        return true;
 }
 
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
 {
        if (is_erratum_383()) {
                /*
@@ -2021,7 +2019,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
                 */
                pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 
                return;
        }
@@ -2033,20 +2031,21 @@ static void svm_handle_mce(struct vcpu_svm *svm)
        kvm_machine_check();
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        /*
         * The VM save area has already been encrypted so it
         * cannot be reinitialized - just terminate.
         */
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                return -EINVAL;
 
        /*
@@ -2054,20 +2053,20 @@ static int shutdown_interception(struct vcpu_svm *svm)
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
        unsigned port;
 
-       ++svm->vcpu.stat.io_exits;
+       ++vcpu->stat.io_exits;
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        port = io_info >> 16;
@@ -2082,93 +2081,69 @@ static int io_interception(struct vcpu_svm *svm)
 
        svm->next_rip = svm->vmcb->control.exit_info_2;
 
-       return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
-       return 1;
+       return kvm_fast_pio(vcpu, size, port, in);
 }
 
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
 {
-       ++svm->vcpu.stat.irq_exits;
        return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
 {
+       ++vcpu->stat.irq_exits;
        return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
 {
-       return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
-       struct vmcb *nested_vmcb;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb12;
        struct kvm_host_map map;
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
        if (ret) {
                if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
-       nested_vmcb = map.hva;
+       vmcb12 = map.hva;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
-       nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       if (vmload) {
+               nested_svm_vmloadsave(vmcb12, svm->vmcb);
+               svm->sysenter_eip_hi = 0;
+               svm->sysenter_esp_hi = 0;
+       } else
+               nested_svm_vmloadsave(svm->vmcb, vmcb12);
+
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
 {
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       int ret;
-
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-       if (ret) {
-               if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
-
-       nested_vmcb = map.hva;
-
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-       nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       return vmload_vmsave_interception(vcpu, true);
+}
 
-       return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+       return vmload_vmsave_interception(vcpu, false);
 }
 
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
 {
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       return nested_svm_vmrun(svm);
+       return nested_svm_vmrun(vcpu);
 }
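vmload_interception() and vmsave_interception() are folded into a single vmload_vmsave_interception() whose only difference is the copy direction handed to nested_svm_vmloadsave(). For reference, the register set VMLOAD/VMSAVE transfer (per APM vol. 2) is roughly the one modelled below; this is an illustrative sketch, not the kernel's struct vmcb_save_area:

#include <stdint.h>
#include <string.h>

/* Simplified: the real save area stores full segment descriptors for
 * FS/GS/TR/LDTR, not a single 64-bit value per register. */
struct vmsave_state {
	uint64_t fs, gs, tr, ldtr;
	uint64_t kernel_gs_base;
	uint64_t star, lstar, cstar, sfmask;
	uint64_t sysenter_cs, sysenter_esp, sysenter_eip;
};

/* VMLOAD copies from the memory image into the active state, VMSAVE
 * copies the active state back out; same move, opposite direction,
 * which is exactly the asymmetry the merged handler keys on. */
static void vmloadsave(const struct vmsave_state *from,
		       struct vmsave_state *to)
{
	memcpy(to, from, sizeof(*to));
}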
 
 enum {
@@ -2207,7 +2182,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
        };
-       int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+       int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
                [SVM_INSTR_VMRUN] = vmrun_interception,
                [SVM_INSTR_VMLOAD] = vmload_interception,
                [SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2191,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
        int ret;
 
        if (is_guest_mode(vcpu)) {
-               svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
-               svm->vmcb->control.exit_info_1 = 0;
-               svm->vmcb->control.exit_info_2 = 0;
-
                /* Returns '1' or -errno on failure, '0' on success. */
-               ret = nested_svm_vmexit(svm);
+               ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
                if (ret)
                        return ret;
                return 1;
        }
-       return svm_instr_handlers[opcode](svm);
+       return svm_instr_handlers[opcode](vcpu);
 }
 
 /*
@@ -2237,9 +2208,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
  *      regions (e.g. SMM memory on host).
  *   2) VMware backdoor
  */
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 error_code = svm->vmcb->control.exit_info_1;
        int opcode;
 
@@ -2304,73 +2275,52 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
        }
 }
 
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, true);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), true);
        return ret;
 }
 
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, false);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), false);
        return ret;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-
-       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
-                         kvm_rax_read(&svm->vcpu));
+       trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, kvm_rcx_read(vcpu),
+                         kvm_rax_read(vcpu));
 
        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-       kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
-
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+       kvm_mmu_invlpg(vcpu, kvm_rax_read(vcpu));
 
-static int skinit_interception(struct vcpu_svm *svm)
-{
-       trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
-
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wbinvd(&svm->vcpu);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
 {
-       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
-       u32 index = kvm_rcx_read(&svm->vcpu);
+       trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
 
-       int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
 
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u16 tss_selector;
        int reason;
        int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2349,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
        if (reason == TASK_SWITCH_GATE) {
                switch (type) {
                case SVM_EXITINTINFO_TYPE_NMI:
-                       svm->vcpu.arch.nmi_injected = false;
+                       vcpu->arch.nmi_injected = false;
                        break;
                case SVM_EXITINTINFO_TYPE_EXEPT:
                        if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2358,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
                                error_code =
                                        (u32)svm->vmcb->control.exit_info_2;
                        }
-                       kvm_clear_exception_queue(&svm->vcpu);
+                       kvm_clear_exception_queue(vcpu);
                        break;
                case SVM_EXITINTINFO_TYPE_INTR:
-                       kvm_clear_interrupt_queue(&svm->vcpu);
+                       kvm_clear_interrupt_queue(vcpu);
                        break;
                default:
                        break;
@@ -2422,77 +2372,58 @@ static int task_switch_interception(struct vcpu_svm *svm)
            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-               if (!skip_emulated_instruction(&svm->vcpu))
+               if (!skip_emulated_instruction(vcpu))
                        return 0;
        }
 
        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
                int_vec = -1;
 
-       return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+       return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
                               has_error_code, error_code);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_cpuid(&svm->vcpu);
-}
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-static int iret_interception(struct vcpu_svm *svm)
-{
-       ++svm->vcpu.stat.nmi_window_exits;
-       svm->vcpu.arch.hflags |= HF_IRET_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       ++vcpu->stat.nmi_window_exits;
+       vcpu->arch.hflags |= HF_IRET_MASK;
+       if (!sev_es_guest(vcpu->kvm)) {
                svm_clr_intercept(svm, INTERCEPT_IRET);
-               svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+               svm->nmi_iret_rip = kvm_rip_read(vcpu);
        }
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
 }
 
-static int invd_interception(struct vcpu_svm *svm)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
 {
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return kvm_emulate_instruction(&svm->vcpu, 0);
-
-       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+               return kvm_emulate_instruction(vcpu, 0);
 
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_instruction(&svm->vcpu, 0);
+       kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int rsm_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
-static int rdpmc_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
 {
-       int err;
-
-       if (!nrips)
-               return emulate_on_interception(svm);
-
-       err = kvm_rdpmc(&svm->vcpu);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
 }
 
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
                                            unsigned long val)
 {
-       unsigned long cr0 = svm->vcpu.arch.cr0;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long cr0 = vcpu->arch.cr0;
        bool ret = false;
 
-       if (!is_guest_mode(&svm->vcpu) ||
+       if (!is_guest_mode(vcpu) ||
            (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;
 
@@ -2509,17 +2440,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
 
 #define CR_VALID (1ULL << 63)
 
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, cr;
        unsigned long val;
        int err;
 
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2462,61 @@ static int cr_interception(struct vcpu_svm *svm)
        err = 0;
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       if (!check_selective_cr0_intercepted(svm, val))
-                               err = kvm_set_cr0(&svm->vcpu, val);
+                       if (!check_selective_cr0_intercepted(vcpu, val))
+                               err = kvm_set_cr0(vcpu, val);
                        else
                                return 1;
 
                        break;
                case 3:
-                       err = kvm_set_cr3(&svm->vcpu, val);
+                       err = kvm_set_cr3(vcpu, val);
                        break;
                case 4:
-                       err = kvm_set_cr4(&svm->vcpu, val);
+                       err = kvm_set_cr4(vcpu, val);
                        break;
                case 8:
-                       err = kvm_set_cr8(&svm->vcpu, val);
+                       err = kvm_set_cr8(vcpu, val);
                        break;
                default:
                        WARN(1, "unhandled write to CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
        } else { /* mov from cr */
                switch (cr) {
                case 0:
-                       val = kvm_read_cr0(&svm->vcpu);
+                       val = kvm_read_cr0(vcpu);
                        break;
                case 2:
-                       val = svm->vcpu.arch.cr2;
+                       val = vcpu->arch.cr2;
                        break;
                case 3:
-                       val = kvm_read_cr3(&svm->vcpu);
+                       val = kvm_read_cr3(vcpu);
                        break;
                case 4:
-                       val = kvm_read_cr4(&svm->vcpu);
+                       val = kvm_read_cr4(vcpu);
                        break;
                case 8:
-                       val = kvm_get_cr8(&svm->vcpu);
+                       val = kvm_get_cr8(vcpu);
                        break;
                default:
                        WARN(1, "unhandled read from CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_register_write(vcpu, reg, val);
                trace_kvm_cr_read(cr, val);
        }
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_value, new_value;
        unsigned int cr;
        int ret = 0;
@@ -2606,7 +2538,7 @@ static int cr_trap(struct vcpu_svm *svm)
                kvm_post_set_cr4(vcpu, old_value, new_value);
                break;
        case 8:
-               ret = kvm_set_cr8(&svm->vcpu, new_value);
+               ret = kvm_set_cr8(vcpu, new_value);
                break;
        default:
                WARN(1, "unhandled CR%d write trap", cr);
@@ -2617,57 +2549,57 @@ static int cr_trap(struct vcpu_svm *svm)
        return kvm_complete_insn_gp(vcpu, ret);
 }
 
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, dr;
        unsigned long val;
        int err = 0;
 
-       if (svm->vcpu.guest_debug == 0) {
+       if (vcpu->guest_debug == 0) {
                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                clr_dr_intercepts(svm);
-               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }
 
        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
        if (dr >= 16) { /* mov to DRn  */
                dr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
-               err = kvm_set_dr(&svm->vcpu, dr, val);
+               val = kvm_register_read(vcpu, reg);
+               err = kvm_set_dr(vcpu, dr, val);
        } else {
-               kvm_get_dr(&svm->vcpu, dr, &val);
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_get_dr(vcpu, dr, &val);
+               kvm_register_write(vcpu, reg, val);
        }
 
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
        int r;
 
-       u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+       u8 cr8_prev = kvm_get_cr8(vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       r = cr_interception(svm);
-       if (lapic_in_kernel(&svm->vcpu))
+       r = cr_interception(vcpu);
+       if (lapic_in_kernel(vcpu))
                return r;
-       if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+       if (cr8_prev <= kvm_get_cr8(vcpu))
                return r;
-       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
 }
 
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
 {
        struct msr_data msr_info;
        int ret;
@@ -2680,10 +2612,10 @@ static int efer_trap(struct vcpu_svm *svm)
         */
        msr_info.host_initiated = false;
        msr_info.index = MSR_EFER;
-       msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
-       ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+       msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+       ret = kvm_set_msr_common(vcpu, &msr_info);
 
-       return kvm_complete_insn_gp(&svm->vcpu, ret);
+       return kvm_complete_insn_gp(vcpu, ret);
 }
 
 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2710,30 +2642,34 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
        switch (msr_info->index) {
        case MSR_STAR:
-               msr_info->data = svm->vmcb->save.star;
+               msr_info->data = svm->vmcb01.ptr->save.star;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               msr_info->data = svm->vmcb->save.lstar;
+               msr_info->data = svm->vmcb01.ptr->save.lstar;
                break;
        case MSR_CSTAR:
-               msr_info->data = svm->vmcb->save.cstar;
+               msr_info->data = svm->vmcb01.ptr->save.cstar;
                break;
        case MSR_KERNEL_GS_BASE:
-               msr_info->data = svm->vmcb->save.kernel_gs_base;
+               msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
                break;
        case MSR_SYSCALL_MASK:
-               msr_info->data = svm->vmcb->save.sfmask;
+               msr_info->data = svm->vmcb01.ptr->save.sfmask;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               msr_info->data = svm->vmcb->save.sysenter_cs;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
                break;
        case MSR_IA32_SYSENTER_EIP:
-               msr_info->data = svm->sysenter_eip;
+               msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+               if (guest_cpuid_is_intel(vcpu))
+                       msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
                break;
        case MSR_IA32_SYSENTER_ESP:
-               msr_info->data = svm->sysenter_esp;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+               if (guest_cpuid_is_intel(vcpu))
+                       msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
@@ -2771,7 +2707,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;
 
-               msr_info->data = svm->spec_ctrl;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       msr_info->data = svm->vmcb->save.spec_ctrl;
+               else
+                       msr_info->data = svm->spec_ctrl;
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr_info->host_initiated &&
@@ -2809,8 +2748,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       if (!sev_es_guest(svm->vcpu.kvm) || !err)
-               return kvm_complete_insn_gp(&svm->vcpu, err);
+       if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
+               return kvm_complete_insn_gp(vcpu, err);
 
        ghcb_set_sw_exit_info_1(svm->ghcb, 1);
        ghcb_set_sw_exit_info_2(svm->ghcb,
@@ -2820,11 +2759,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
        return 1;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2861,7 +2795,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
                        return 1;
                vcpu->arch.pat = data;
-               svm->vmcb->save.g_pat = data;
+               svm->vmcb01.ptr->save.g_pat = data;
+               if (is_guest_mode(vcpu))
+                       nested_vmcb02_compute_g_pat(svm);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
@@ -2872,7 +2808,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (kvm_spec_ctrl_test_value(data))
                        return 1;
 
-               svm->spec_ctrl = data;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       svm->vmcb->save.spec_ctrl = data;
+               else
+                       svm->spec_ctrl = data;
                if (!data)
                        break;
 
@@ -2915,32 +2854,39 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->virt_spec_ctrl = data;
                break;
        case MSR_STAR:
-               svm->vmcb->save.star = data;
+               svm->vmcb01.ptr->save.star = data;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               svm->vmcb->save.lstar = data;
+               svm->vmcb01.ptr->save.lstar = data;
                break;
        case MSR_CSTAR:
-               svm->vmcb->save.cstar = data;
+               svm->vmcb01.ptr->save.cstar = data;
                break;
        case MSR_KERNEL_GS_BASE:
-               svm->vmcb->save.kernel_gs_base = data;
+               svm->vmcb01.ptr->save.kernel_gs_base = data;
                break;
        case MSR_SYSCALL_MASK:
-               svm->vmcb->save.sfmask = data;
+               svm->vmcb01.ptr->save.sfmask = data;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               svm->vmcb->save.sysenter_cs = data;
+               svm->vmcb01.ptr->save.sysenter_cs = data;
                break;
        case MSR_IA32_SYSENTER_EIP:
-               svm->sysenter_eip = data;
-               svm->vmcb->save.sysenter_eip = data;
+               svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+               /*
+                * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+                * when we spoof an Intel vendor ID (for cross vendor migration).
+                * In this case we use this intercept to track the high
+                * 32 bit part of these msrs to support Intel's
+                * implementation of SYSENTER/SYSEXIT.
+                */
+               svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_IA32_SYSENTER_ESP:
-               svm->sysenter_esp = data;
-               svm->vmcb->save.sysenter_esp = data;
+               svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+               svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
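The SYSENTER changes above exist because AMD hardware only keeps 32 bits of SYSENTER_EIP/ESP in the VMCB, while a guest that sees a spoofed Intel vendor ID expects full 64-bit Intel semantics; KVM therefore intercepts these MSRs for such guests and tracks the high halves in sysenter_{eip,esp}_hi. A plain-C illustration of the split bookkeeping (not kernel code):

#include <stdint.h>
#include <stdio.h>

struct sysenter_state {
	uint64_t vmcb_sysenter_eip;	/* only the low 32 bits are architectural */
	uint32_t sysenter_eip_hi;	/* tracked by KVM when spoofing Intel */
};

static void write_sysenter_eip(struct sysenter_state *s, uint64_t data,
			       int guest_is_intel)
{
	s->vmcb_sysenter_eip = (uint32_t)data;
	s->sysenter_eip_hi = guest_is_intel ? (uint32_t)(data >> 32) : 0;
}

static uint64_t read_sysenter_eip(const struct sysenter_state *s,
				  int guest_is_intel)
{
	uint64_t data = (uint32_t)s->vmcb_sysenter_eip;

	if (guest_is_intel)
		data |= (uint64_t)s->sysenter_eip_hi << 32;
	return data;
}

int main(void)
{
	struct sysenter_state s;

	write_sysenter_eip(&s, 0xffffffff81000000ULL, 1);
	printf("%#llx\n", (unsigned long long)read_sysenter_eip(&s, 1));
	return 0;
}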
@@ -3006,38 +2952,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
        return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
 {
-       if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm);
+       if (to_svm(vcpu)->vmcb->control.exit_info_1)
+               return kvm_emulate_wrmsr(vcpu);
        else
-               return rdmsr_interception(svm);
+               return kvm_emulate_rdmsr(vcpu);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       svm_clear_vintr(svm);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       svm_clear_vintr(to_svm(vcpu));
 
        /*
         * For AVIC, the only reason to end up here is ExtINTs.
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+       svm_toggle_avic_for_irq_window(vcpu, true);
 
-       ++svm->vcpu.stat.irq_window_exits;
+       ++vcpu->stat.irq_window_exits;
        return 1;
 }
 
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
        bool in_kernel;
 
        /*
@@ -3045,35 +2985,18 @@ static int pause_interception(struct vcpu_svm *svm)
         * vcpu->arch.preempted_in_kernel can never be true.  Just
         * set in_kernel to false as well.
         */
-       in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+       in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);
 
        kvm_vcpu_on_spin(vcpu, in_kernel);
-       return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
-       return kvm_skip_emulated_instruction(&(svm->vcpu));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
 {
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long type;
        gva_t gva;
 
@@ -3098,7 +3021,7 @@ static int invpcid_interception(struct vcpu_svm *svm)
        return kvm_handle_invpcid(vcpu, type, gva);
 }
 
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
@@ -3133,15 +3056,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = nop_on_interception,
-       [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
+       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
-       [SVM_EXIT_RDPMC]                        = rdpmc_interception,
-       [SVM_EXIT_CPUID]                        = cpuid_interception,
+       [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
+       [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
        [SVM_EXIT_IRET]                         = iret_interception,
-       [SVM_EXIT_INVD]                         = invd_interception,
+       [SVM_EXIT_INVD]                         = kvm_emulate_invd,
        [SVM_EXIT_PAUSE]                        = pause_interception,
-       [SVM_EXIT_HLT]                          = halt_interception,
+       [SVM_EXIT_HLT]                          = kvm_emulate_halt,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
@@ -3149,17 +3072,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
        [SVM_EXIT_VMRUN]                        = vmrun_interception,
-       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
+       [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
        [SVM_EXIT_VMLOAD]                       = vmload_interception,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
-       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
-       [SVM_EXIT_MONITOR]                      = monitor_interception,
-       [SVM_EXIT_MWAIT]                        = mwait_interception,
-       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
-       [SVM_EXIT_RDPRU]                        = rdpru_interception,
+       [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
+       [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
+       [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
+       [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
+       [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
@@ -3177,6 +3100,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
+       struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
 
        if (!dump_invalid_vmcb) {
                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
@@ -3239,28 +3163,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               save->ds.limit, save->ds.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "fs:",
-              save->fs.selector, save->fs.attrib,
-              save->fs.limit, save->fs.base);
+              save01->fs.selector, save01->fs.attrib,
+              save01->fs.limit, save01->fs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gs:",
-              save->gs.selector, save->gs.attrib,
-              save->gs.limit, save->gs.base);
+              save01->gs.selector, save01->gs.attrib,
+              save01->gs.limit, save01->gs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gdtr:",
               save->gdtr.selector, save->gdtr.attrib,
               save->gdtr.limit, save->gdtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ldtr:",
-              save->ldtr.selector, save->ldtr.attrib,
-              save->ldtr.limit, save->ldtr.base);
+              save01->ldtr.selector, save01->ldtr.attrib,
+              save01->ldtr.limit, save01->ldtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "idtr:",
               save->idtr.selector, save->idtr.attrib,
               save->idtr.limit, save->idtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "tr:",
-              save->tr.selector, save->tr.attrib,
-              save->tr.limit, save->tr.base);
+              save01->tr.selector, save01->tr.attrib,
+              save01->tr.limit, save01->tr.base);
        pr_err("cpl:            %d                efer:         %016llx\n",
                save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3274,15 +3198,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rsp:", save->rsp, "rax:", save->rax);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "star:", save->star, "lstar:", save->lstar);
+              "star:", save01->star, "lstar:", save01->lstar);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "cstar:", save->cstar, "sfmask:", save->sfmask);
+              "cstar:", save01->cstar, "sfmask:", save01->sfmask);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "kernel_gs_base:", save->kernel_gs_base,
-              "sysenter_cs:", save->sysenter_cs);
+              "kernel_gs_base:", save01->kernel_gs_base,
+              "sysenter_cs:", save01->sysenter_cs);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "sysenter_esp:", save->sysenter_esp,
-              "sysenter_eip:", save->sysenter_eip);
+              "sysenter_esp:", save01->sysenter_esp,
+              "sysenter_eip:", save01->sysenter_eip);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3309,24 +3233,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
        return -EINVAL;
 }
 
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+       if (svm_handle_invalid_exit(vcpu, exit_code))
                return 0;
 
 #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
-               return msr_interception(svm);
+               return msr_interception(vcpu);
        else if (exit_code == SVM_EXIT_VINTR)
-               return interrupt_window_interception(svm);
+               return interrupt_window_interception(vcpu);
        else if (exit_code == SVM_EXIT_INTR)
-               return intr_interception(svm);
+               return intr_interception(vcpu);
        else if (exit_code == SVM_EXIT_HLT)
-               return halt_interception(svm);
+               return kvm_emulate_halt(vcpu);
        else if (exit_code == SVM_EXIT_NPF)
-               return npf_interception(svm);
+               return npf_interception(vcpu);
 #endif
-       return svm_exit_handlers[exit_code](svm);
+       return svm_exit_handlers[exit_code](vcpu);
 }
 
 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
@@ -3395,7 +3319,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
 
-       return svm_invoke_exit_handler(svm, exit_code);
+       return svm_invoke_exit_handler(vcpu, exit_code);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3406,15 +3330,27 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        load_TR_desc();
 }
 
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
 {
-       struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       /*
+        * If the previous vmrun of the vmcb occurred on a different physical
+        * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
+        * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+        */
+       if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+               svm->current_vmcb->asid_generation = 0;
+               vmcb_mark_all_dirty(svm->vmcb);
+               svm->current_vmcb->cpu = vcpu->cpu;
+       }
 
-       if (sev_guest(svm->vcpu.kvm))
-               return pre_sev_run(svm, svm->vcpu.cpu);
+       if (sev_guest(vcpu->kvm))
+               return pre_sev_run(svm, vcpu->cpu);
 
        /* FIXME: handle wraparound of asid_generation */
-       if (svm->asid_generation != sd->asid_generation)
+       if (svm->current_vmcb->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
 }
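Illustration only (hypothetical names, not the KVM implementation): the per-vmcb cpu and asid_generation fields introduced above exist because VMCB clean bits and ASIDs are only meaningful on the logical CPU where the VMCB last ran. A self-contained sketch of the same bookkeeping:

#include <stdint.h>

struct vmcb_run_state {
	int cpu;                    /* last physical CPU this VMCB ran on */
	uint64_t asid_generation;   /* valid only while it matches the per-CPU generation */
	uint32_t clean_bits;        /* cached-state bits the hardware may trust */
};

void prepare_for_vmrun(struct vmcb_run_state *v, int this_cpu,
		       uint64_t percpu_asid_generation, uint32_t *next_asid)
{
	if (v->cpu != this_cpu) {
		v->clean_bits = 0;        /* force the CPU to reload all VMCB state */
		v->asid_generation = 0;   /* force a fresh ASID below */
		v->cpu = this_cpu;
	}
	if (v->asid_generation != percpu_asid_generation) {
		*next_asid += 1;          /* stand-in for real ASID allocation */
		v->asid_generation = percpu_asid_generation;
	}
}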
 
@@ -3424,7 +3360,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
        vcpu->arch.hflags |= HF_NMI_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_IRET);
        ++vcpu->stat.nmi_injections;
 }
@@ -3478,7 +3414,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
                return false;
 
        ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-             (svm->vcpu.arch.hflags & HF_NMI_MASK);
+             (vcpu->arch.hflags & HF_NMI_MASK);
 
        return ret;
 }
@@ -3498,9 +3434,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+       return !!(vcpu->arch.hflags & HF_NMI_MASK);
 }
 
 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3508,12 +3442,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (masked) {
-               svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags |= HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_set_intercept(svm, INTERCEPT_IRET);
        } else {
-               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags &= ~HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_clr_intercept(svm, INTERCEPT_IRET);
        }
 }
@@ -3526,7 +3460,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        if (!gif_set(svm))
                return true;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests do not expose RFLAGS. Use the VMCB interrupt mask
                 * bit to determine the state of the IF flag.
@@ -3536,7 +3470,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        } else if (is_guest_mode(vcpu)) {
                /* As long as interrupts are being delivered...  */
                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
-                   ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+                   ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;
 
@@ -3595,8 +3529,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
-           == HF_NMI_MASK)
+       if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
                return; /* IRET will cause a vm exit */
 
        if (!gif_set(svm)) {
@@ -3638,7 +3571,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
-               svm->asid_generation--;
+               svm->current_vmcb->asid_generation--;
 }
 
 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3675,8 +3608,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u8 vector;
        int type;
        u32 exitintinfo = svm->vmcb->control.exit_int_info;
@@ -3688,28 +3622,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
         * If we've made progress since setting HF_IRET_MASK, we've
         * executed an IRET and can allow NMI injection.
         */
-       if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
-           (sev_es_guest(svm->vcpu.kvm) ||
-            kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
-               svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+           (sev_es_guest(vcpu->kvm) ||
+            kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+               vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
                return;
 
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
        switch (type) {
        case SVM_EXITINTINFO_TYPE_NMI:
-               svm->vcpu.arch.nmi_injected = true;
+               vcpu->arch.nmi_injected = true;
                break;
        case SVM_EXITINTINFO_TYPE_EXEPT:
                /*
@@ -3725,21 +3659,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
                 */
                if (kvm_exception_is_soft(vector)) {
                        if (vector == BP_VECTOR && int3_injected &&
-                           kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
-                               kvm_rip_write(&svm->vcpu,
-                                             kvm_rip_read(&svm->vcpu) -
-                                             int3_injected);
+                           kvm_is_linear_rip(vcpu, svm->int3_rip))
+                               kvm_rip_write(vcpu,
+                                             kvm_rip_read(vcpu) - int3_injected);
                        break;
                }
                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
                        u32 err = svm->vmcb->control.exit_int_info_err;
-                       kvm_requeue_exception_e(&svm->vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
 
                } else
-                       kvm_requeue_exception(&svm->vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case SVM_EXITINTINFO_TYPE_INTR:
-               kvm_queue_interrupt(&svm->vcpu, vector, false);
+               kvm_queue_interrupt(vcpu, vector, false);
                break;
        default:
                break;
@@ -3754,7 +3687,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        control->exit_int_info = control->event_inj;
        control->exit_int_info_err = control->event_inj_err;
        control->event_inj = 0;
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 }
 
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
@@ -3766,9 +3699,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
        return EXIT_FASTPATH_NONE;
 }
 
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
-                                       struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long vmcb_pa = svm->current_vmcb->pa;
+
        /*
         * VMENTER enables interrupts (host state), but the kernel state is
         * interrupts disabled when this is invoked. Also tell RCU about
@@ -3789,12 +3724,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
        guest_enter_irqoff();
        lockdep_hardirqs_on(CALLER_ADDR0);
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
-               __svm_sev_es_vcpu_run(svm->vmcb_pa);
+       if (sev_es_guest(vcpu->kvm)) {
+               __svm_sev_es_vcpu_run(vmcb_pa);
        } else {
                struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
 
-               __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+               /*
+                * Use a single vmcb (vmcb01 because it's always valid) for
+                * context switching guest state via VMLOAD/VMSAVE; that way
+                * the state doesn't need to be copied between vmcb01 and
+                * vmcb02 when switching vmcbs for nested virtualization.
+                */
+               vmload(svm->vmcb01.pa);
+               __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+               vmsave(svm->vmcb01.pa);
 
                vmload(__sme_page_pa(sd->save_area));
        }
@@ -3845,7 +3788,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                smp_send_reschedule(vcpu->cpu);
        }
 
-       pre_svm_run(svm);
+       pre_svm_run(vcpu);
 
        sync_lapic_to_cr8(vcpu);
 
@@ -3859,7 +3802,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
         */
-       if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
                svm_set_dr6(svm, vcpu->arch.dr6);
        else
                svm_set_dr6(svm, DR6_ACTIVE_LOW);
@@ -3875,9 +3818,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
-       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       svm_vcpu_enter_exit(vcpu, svm);
+       svm_vcpu_enter_exit(vcpu);
 
        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
@@ -3894,15 +3838,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
-       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+           unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                reload_tss(vcpu);
 
-       x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       if (!sev_es_guest(vcpu->kvm)) {
                vcpu->arch.cr2 = svm->vmcb->save.cr2;
                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
@@ -3910,7 +3856,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_before_interrupt(&svm->vcpu);
+               kvm_before_interrupt(vcpu);
 
        kvm_load_host_xsave_state(vcpu);
        stgi();
@@ -3918,13 +3864,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        /* Any pending NMI will happen here */
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_after_interrupt(&svm->vcpu);
+               kvm_after_interrupt(vcpu);
 
        sync_cr8_to_lapic(vcpu);
 
        svm->next_rip = 0;
-       if (is_guest_mode(&svm->vcpu)) {
-               sync_nested_vmcb_control(svm);
+       if (is_guest_mode(vcpu)) {
+               nested_sync_control_from_vmcb02(svm);
                svm->nested.nested_run_pending = 0;
        }
 
@@ -3933,7 +3879,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-               svm->vcpu.arch.apf.host_apf_flags =
+               vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();
 
        if (npt_enabled) {
@@ -3947,9 +3893,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         */
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
-               svm_handle_mce(svm);
+               svm_handle_mce(vcpu);
 
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;
@@ -3957,21 +3903,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        return svm_exit_handlers_fastpath(vcpu);
 }
 
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                             int root_level)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;
 
-       cr3 = __sme_set(root);
        if (npt_enabled) {
-               svm->vmcb->control.nested_cr3 = cr3;
+               svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
 
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
                cr3 = vcpu->arch.cr3;
+       } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+               cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+       } else {
+               /* PCID in the guest should be impossible with a 32-bit MMU. */
+               WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+               cr3 = root_hpa;
        }
 
        svm->vmcb->save.cr3 = cr3;
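Illustration only (hypothetical helper, SME bit omitted): in the shadow-paging branch added above, the root is a page-aligned physical address and the guest's active PCID occupies CR3 bits 11:0 when CR4.PCIDE=1, so the two can be combined with a plain OR.

#include <assert.h>
#include <stdint.h>

uint64_t shadow_cr3(uint64_t root_hpa, uint64_t guest_cr3, int pcide_enabled)
{
	uint64_t pcid = pcide_enabled ? (guest_cr3 & 0xfff) : 0;

	assert((root_hpa & 0xfff) == 0);   /* shadow roots are page aligned */
	return root_hpa | pcid;
}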
@@ -4048,7 +3999,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        /* Update nrips enabled cache */
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
-                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+                            guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
        /* Check again if INVPCID interception is required */
        svm_check_invpcid(svm);
@@ -4060,24 +4011,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                        vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
        }
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
+       if (kvm_vcpu_apicv_active(vcpu)) {
+               /*
+                * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+                * is exposed to the guest, disable AVIC.
+                */
+               if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+                       kvm_request_apicv_update(vcpu->kvm, false,
+                                                APICV_INHIBIT_REASON_X2APIC);
 
-       /*
-        * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
-        * is exposed to the guest, disable AVIC.
-        */
-       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_X2APIC);
+               /*
+                * Currently, AVIC does not work with nested virtualization.
+                * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+                */
+               if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+                       kvm_request_apicv_update(vcpu->kvm, false,
+                                                APICV_INHIBIT_REASON_NESTED);
+       }
 
-       /*
-        * Currently, AVIC does not work with nested virtualization.
-        * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
-        */
-       if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_NESTED);
+       if (guest_cpuid_is_intel(vcpu)) {
+               /*
+                * We must intercept SYSENTER_EIP and SYSENTER_ESP
+                * accesses because the processor only stores 32 bits.
+                * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+                */
+               svm_set_intercept(svm, INTERCEPT_VMLOAD);
+               svm_set_intercept(svm, INTERCEPT_VMSAVE);
+               svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+       } else {
+               /*
+                * If hardware supports Virtual VMLOAD VMSAVE then enable it
+                * in VMCB and clear intercepts to avoid #VMEXIT.
+                */
+               if (vls) {
+                       svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+                       svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+                       svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+               }
+               /* No need to intercept these MSRs */
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+       }
 }
 
 static bool svm_has_wbinvd_exit(void)
@@ -4349,15 +4326,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (!(saved_efer & EFER_SVME))
                                return 1;
 
-                       if (kvm_vcpu_map(&svm->vcpu,
+                       if (kvm_vcpu_map(vcpu,
                                         gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
                                return 1;
 
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
-                       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       kvm_vcpu_unmap(vcpu, &map, true);
                }
        }
 
@@ -4612,6 +4589,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
 
+       .vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
        .can_emulate_instruction = svm_can_emulate_instruction,
 
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
index 9806aae..d620619 100644 (file)
@@ -28,7 +28,10 @@ static const u32 host_save_user_msrs[] = {
 };
 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 
-#define MAX_DIRECT_ACCESS_MSRS 18
+#define IOPM_SIZE (PAGE_SIZE * 3)
+#define MSRPM_SIZE (PAGE_SIZE * 2)
+
+#define MAX_DIRECT_ACCESS_MSRS 20
 #define MSRPM_OFFSETS  16
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
@@ -65,6 +68,7 @@ struct kvm_sev_info {
        unsigned long pages_locked; /* Number of pages locked */
        struct list_head regions_list;  /* List of registered regions */
        u64 ap_jump_table;      /* SEV-ES AP Jump Table address */
+       struct kvm *enc_context_owner; /* Owner of copied encryption context */
        struct misc_cg *misc_cg; /* For misc cgroup accounting */
 };
 
@@ -82,11 +86,19 @@ struct kvm_svm {
 
 struct kvm_vcpu;
 
+struct kvm_vmcb_info {
+       struct vmcb *ptr;
+       unsigned long pa;
+       int cpu;
+       uint64_t asid_generation;
+};
+
 struct svm_nested_state {
-       struct vmcb *hsave;
+       struct kvm_vmcb_info vmcb02;
        u64 hsave_msr;
        u64 vm_cr_msr;
        u64 vmcb12_gpa;
+       u64 last_vmcb12_gpa;
 
        /* These are the merged vectors */
        u32 *msrpm;
@@ -103,13 +115,14 @@ struct svm_nested_state {
 
 struct vcpu_svm {
        struct kvm_vcpu vcpu;
+       /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
        struct vmcb *vmcb;
-       unsigned long vmcb_pa;
+       struct kvm_vmcb_info vmcb01;
+       struct kvm_vmcb_info *current_vmcb;
        struct svm_cpu_data *svm_data;
        u32 asid;
-       uint64_t asid_generation;
-       uint64_t sysenter_esp;
-       uint64_t sysenter_eip;
+       u32 sysenter_esp_hi;
+       u32 sysenter_eip_hi;
        uint64_t tsc_aux;
 
        u64 msr_decfg;
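Illustration only (hypothetical names, not the actual svm_switch_vmcb()): the "purely a shorthand" comment above implies an invariant that any vmcb switch must preserve, namely svm->vmcb == svm->current_vmcb->ptr. A minimal sketch of such a switch:

struct vmcb;   /* opaque for this sketch */

struct vmcb_info_sketch {
	struct vmcb *ptr;
	unsigned long pa;
	int cpu;
	unsigned long long asid_generation;
};

struct vcpu_svm_sketch {
	struct vmcb *vmcb;                    /* must always equal current_vmcb->ptr */
	struct vmcb_info_sketch vmcb01;
	struct vmcb_info_sketch *current_vmcb;
};

void switch_vmcb_sketch(struct vcpu_svm_sketch *svm, struct vmcb_info_sketch *target)
{
	svm->current_vmcb = target;
	svm->vmcb = target->ptr;              /* keep the shorthand in sync */
}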
@@ -240,17 +253,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
        vmcb->control.clean &= ~(1 << bit);
 }
 
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
 {
-       return container_of(vcpu, struct vcpu_svm, vcpu);
+       return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
 }
 
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(&svm->vcpu))
-               return svm->nested.hsave;
-       else
-               return svm->vmcb;
+       return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
 static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
@@ -273,7 +283,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
 
 static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        if (!sev_es_guest(svm->vcpu.kvm)) {
                vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
@@ -300,7 +310,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb->control.intercepts[INTERCEPT_DR] = 0;
 
@@ -315,7 +325,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -325,7 +335,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -335,7 +345,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_set_intercept(&vmcb->control, bit);
 
@@ -344,7 +354,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 
 static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_clr_intercept(&vmcb->control, bit);
 
@@ -406,7 +416,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu);
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
 void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
                          int read, int write);
 
@@ -438,20 +448,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-                        struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
 void svm_leave_nested(struct vcpu_svm *svm);
 void svm_free_nested(struct vcpu_svm *svm);
 int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+       svm->vmcb->control.exit_code   = exit_code;
+       svm->vmcb->control.exit_info_1 = 0;
+       svm->vmcb->control.exit_info_2 = 0;
+       return nested_svm_vmexit(svm);
+}
+
 int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
 
 extern struct kvm_x86_nested_ops svm_nested_ops;
 
@@ -492,8 +512,8 @@ void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm);
 void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
@@ -562,11 +582,12 @@ int svm_register_enc_region(struct kvm *kvm,
                            struct kvm_enc_region *range);
 int svm_unregister_enc_region(struct kvm *kvm,
                              struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
 void __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
 void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
 void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_create_vcpu(struct vcpu_svm *svm);
index 6feb8c0..4fa17df 100644 (file)
@@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run)
 
        /* Enter guest mode */
        sti
-1:     vmload %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     vmrun %_ASM_AX
-       jmp 5f
-4:     cmpb $0, kvm_rebooting
-       jne 5f
-       ud2
-       _ASM_EXTABLE(3b, 4b)
+1:     vmrun %_ASM_AX
 
-5:     vmsave %_ASM_AX
-       jmp 7f
-6:     cmpb $0, kvm_rebooting
-       jne 7f
-       ud2
-       _ASM_EXTABLE(5b, 6b)
-7:
-       cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_vcpu_run)
 
 /**
@@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        push %_ASM_BX
 
-       /* Enter guest mode */
+       /* Move @vmcb to RAX. */
        mov %_ASM_ARG1, %_ASM_AX
+
+       /* Enter guest mode */
        sti
 
 1:     vmrun %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_sev_es_vcpu_run)
index bcca0b8..8b11168 100644 (file)
@@ -11,6 +11,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmx.h"
 #include "x86.h"
@@ -21,13 +22,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested_early_check = 0;
 module_param(nested_early_check, bool, S_IRUGO);
 
-#define CC(consistency_check)                                          \
-({                                                                     \
-       bool failed = (consistency_check);                              \
-       if (failed)                                                     \
-               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
-       failed;                                                         \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 
 /*
  * Hyper-V requires all of these, so mark them as supported even though
@@ -2306,6 +2301,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
                    exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+               if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+                       vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
                secondary_exec_controls_set(vmx, exec_control);
        }
 
@@ -3453,6 +3451,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
        enum nested_evmptrld_status evmptrld_status;
 
+       ++vcpu->stat.nested_run;
+
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
@@ -3810,9 +3810,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 
        /*
         * Process any exceptions that are not debug traps before MTF.
+        *
+        * Note that only a pending nested run can block a pending exception.
+        * Otherwise an injected NMI/interrupt should either be
+        * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
+        * while delivering the pending exception.
         */
+
        if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-               if (block_nested_events)
+               if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -3829,7 +3835,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -4105,6 +4111,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
        /* update exit information fields: */
        vmcs12->vm_exit_reason = vm_exit_reason;
+       if (to_vmx(vcpu)->exit_reason.enclave_mode)
+               vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
        vmcs12->exit_qualification = exit_qualification;
        vmcs12->vm_exit_intr_info = exit_intr_info;
 
@@ -4422,6 +4430,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+       /* Similarly, triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
        kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* Service the TLB flush request for L2 before switching to L1. */
@@ -4558,6 +4569,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        vmx->fail = 0;
 }
 
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
 /*
  * Decode the memory-address operand of a vmx instruction, as recorded on an
  * exit caused by such an instruction (run by a guest hypervisor).
@@ -5479,16 +5495,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
                if (!nested_vmx_check_eptp(vcpu, new_eptp))
                        return 1;
 
-               kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = new_eptp;
-               /*
-                * TODO: Check what's the correct approach in case
-                * mmu reload fails. Currently, we just let the next
-                * reload potentially fail
-                */
-               kvm_mmu_reload(vcpu);
+
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
        }
 
        return 0;
@@ -5705,6 +5716,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
        return false;
 }
 
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       u32 encls_leaf;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+               return false;
+
+       encls_leaf = kvm_rax_read(vcpu);
+       if (encls_leaf > 62)
+               encls_leaf = 63;
+       return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
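Illustration only: the clamp above reflects the layout of the ENCLS-exiting bitmap, one bit per leaf with every leaf above 62 sharing bit 63. A runnable sketch:

#include <stdint.h>
#include <stdio.h>

int encls_leaf_intercepted(uint64_t bitmap, uint32_t leaf)
{
	if (leaf > 62)
		leaf = 63;
	return (bitmap >> leaf) & 1;
}

int main(void)
{
	uint64_t bitmap = (1ULL << 2) | (1ULL << 63);  /* leaf 2 plus the catch-all bit */

	printf("%d %d %d\n",
	       encls_leaf_intercepted(bitmap, 2),      /* 1: explicitly set */
	       encls_leaf_intercepted(bitmap, 5),      /* 0: not set */
	       encls_leaf_intercepted(bitmap, 100));   /* 1: clamps to bit 63 */
	return 0;
}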
 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
        struct vmcs12 *vmcs12, gpa_t bitmap)
 {
@@ -5801,9 +5827,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_VMFUNC:
                /* VM functions are emulated through L2->L0 vmexits. */
                return true;
-       case EXIT_REASON_ENCLS:
-               /* SGX is never exposed to L1 */
-               return true;
        default:
                break;
        }
@@ -5927,6 +5950,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_TPAUSE:
                return nested_cpu_has2(vmcs12,
                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+       case EXIT_REASON_ENCLS:
+               return nested_vmx_exit_handled_encls(vcpu, vmcs12);
        default:
                return true;
        }
@@ -6502,6 +6527,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
                msrs->secondary_ctls_high |=
                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 
+       if (enable_sgx)
+               msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC,
                msrs->misc_low,
@@ -6599,6 +6627,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 struct kvm_x86_nested_ops vmx_nested_ops = {
        .check_events = vmx_check_nested_events,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
        .get_nested_state_pages = vmx_get_nested_state_pages,
index 197148d..184418b 100644 (file)
@@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
                PIN_BASED_EXT_INTR_MASK;
 }
 
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
 /*
  * if fixed0[i] == 1: val[i] must be 1
  * if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644 (file)
index 0000000..6693ebd
--- /dev/null
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode.  Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+                            int size, int alignment, gva_t *gva)
+{
+       struct kvm_segment s;
+       bool fault;
+
+       /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+       *gva = offset;
+       if (!is_long_mode(vcpu)) {
+               vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+               *gva += s.base;
+       }
+
+       if (!IS_ALIGNED(*gva, alignment)) {
+               fault = true;
+       } else if (likely(is_long_mode(vcpu))) {
+               fault = is_noncanonical_address(*gva, vcpu);
+       } else {
+               *gva &= 0xffffffff;
+               fault = (s.unusable) ||
+                       (s.type != 2 && s.type != 3) ||
+                       (*gva > s.limit) ||
+                       ((s.base != 0 || s.limit != 0xffffffff) &&
+                       (((u64)*gva + size - 1) > s.limit + 1));
+       }
+       if (fault)
+               kvm_inject_gp(vcpu, 0);
+       return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+                                        unsigned int size)
+{
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 2;
+       vcpu->run->internal.data[0] = addr;
+       vcpu->run->internal.data[1] = size;
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+                       unsigned int size)
+{
+       if (__copy_from_user(data, (void __user *)hva, size)) {
+               sgx_handle_emulation_failure(vcpu, hva, size);
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+                         gpa_t *gpa)
+{
+       struct x86_exception ex;
+
+       if (write)
+               *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+       else
+               *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+       if (*gpa == UNMAPPED_GVA) {
+               kvm_inject_emulated_page_fault(vcpu, &ex);
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+       *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+       if (kvm_is_error_hva(*hva)) {
+               sgx_handle_emulation_failure(vcpu, gpa, 1);
+               return -EFAULT;
+       }
+
+       *hva |= gpa & ~PAGE_MASK;
+
+       return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+       struct x86_exception ex;
+
+       /*
+        * A non-EPCM #PF indicates a bad userspace HVA.  This *should* check
+        * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+        * but the error code isn't (yet) plumbed through the ENCLS helpers.
+        */
+       if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return 0;
+       }
+
+       /*
+        * If the guest thinks it's running on SGX2 hardware, inject an SGX
+        * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+        * #PF on SGX2).  The assumption is that EPCM faults are much more
+        * likely than a bad userspace address.
+        */
+       if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+           guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+               memset(&ex, 0, sizeof(ex));
+               ex.vector = PF_VECTOR;
+               ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+                               PFERR_SGX_MASK;
+               ex.address = gva;
+               ex.error_code_valid = true;
+               ex.nested_page_fault = false;
+               kvm_inject_page_fault(vcpu, &ex);
+       } else {
+               kvm_inject_gp(vcpu, 0);
+       }
+       return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+                                 struct sgx_pageinfo *pageinfo,
+                                 unsigned long secs_hva,
+                                 gva_t secs_gva)
+{
+       struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+       struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+       u64 attributes, xfrm, size;
+       u32 miscselect;
+       u8 max_size_log2;
+       int trapnr, ret;
+
+       sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+       sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+       if (!sgx_12_0 || !sgx_12_1) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return 0;
+       }
+
+       miscselect = contents->miscselect;
+       attributes = contents->attributes;
+       xfrm = contents->xfrm;
+       size = contents->size;
+
+       /* Enforce restriction of access to the PROVISIONKEY. */
+       if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+           (attributes & SGX_ATTR_PROVISIONKEY)) {
+               if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+                       pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+       if ((u32)miscselect & ~sgx_12_0->ebx ||
+           (u32)attributes & ~sgx_12_1->eax ||
+           (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+           (u32)xfrm & ~sgx_12_1->ecx ||
+           (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /* Enforce CPUID restriction on max enclave size. */
+       max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+                                                           sgx_12_0->edx;
+       if (size >= BIT_ULL(max_size_log2)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /*
+        * sgx_virt_ecreate() returns:
+        *  1) 0:       ECREATE was successful
+        *  2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+        *              exception number.
+        *  3) -EINVAL: access_ok() on @secs_hva failed. This should never
+        *              happen as KVM checks host addresses at memslot creation.
+        *              sgx_virt_ecreate() has already warned in this case.
+        */
+       ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+       if (!ret)
+               return kvm_skip_emulated_instruction(vcpu);
+       if (ret == -EFAULT)
+               return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+       return ret;
+}
+
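Illustration only: the size check above relies on CPUID leaf 0x12, sub-leaf 0 reporting the maximum enclave size as a power of two in EDX (bits 7:0 for 32-bit enclaves, bits 15:8 for 64-bit ones), hence the comparison against BIT_ULL(max_size_log2). A small sketch with a made-up EDX value:

#include <stdint.h>
#include <stdio.h>

uint64_t max_enclave_size(uint32_t cpuid_12_0_edx, int mode64bit)
{
	uint8_t log2_size = mode64bit ? (cpuid_12_0_edx >> 8) : cpuid_12_0_edx;

	return 1ULL << log2_size;
}

int main(void)
{
	/* A hypothetical EDX of 0x2424 advertises 2^36 bytes in both modes. */
	printf("%llu\n", (unsigned long long)max_enclave_size(0x2424, 1));
	return 0;
}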
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+       gva_t pageinfo_gva, secs_gva;
+       gva_t metadata_gva, contents_gva;
+       gpa_t metadata_gpa, contents_gpa, secs_gpa;
+       unsigned long metadata_hva, contents_hva, secs_hva;
+       struct sgx_pageinfo pageinfo;
+       struct sgx_secs *contents;
+       struct x86_exception ex;
+       int r;
+
+       if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+           sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+               return 1;
+
+       /*
+        * Copy the PAGEINFO to local memory, its pointers need to be
+        * translated, i.e. we need to do a deep copy/translate.
+        */
+       r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+                               sizeof(pageinfo), &ex);
+       if (r == X86EMUL_PROPAGATE_FAULT) {
+               kvm_inject_emulated_page_fault(vcpu, &ex);
+               return 1;
+       } else if (r != X86EMUL_CONTINUE) {
+               sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+                                            sizeof(pageinfo));
+               return 0;
+       }
+
+       if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+           sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+                             &contents_gva))
+               return 1;
+
+       /*
+        * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+        * Resume the guest on failure to inject a #PF.
+        */
+       if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+           sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+           sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+               return 1;
+
+       /*
+        * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+        * KVM doesn't have to fully process one address at a time.  Exit to
+        * userspace if a GPA is invalid.
+        */
+       if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+           sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+           sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+               return 0;
+
+       /*
+        * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+        * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+        * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+        * enforce restriction of access to the PROVISIONKEY.
+        */
+       contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+       if (!contents)
+               return -ENOMEM;
+
+       /* Exit to userspace if copying from a host userspace address fails. */
+       if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+               free_page((unsigned long)contents);
+               return 0;
+       }
+
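+       /*
+        * Point PAGEINFO at the translated metadata address and at the kernel
+        * copy of the SECS contents so that ECREATE is executed on the
+        * guest's behalf using host-accessible pointers.
+        */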
+       pageinfo.metadata = metadata_hva;
+       pageinfo.contents = (u64)contents;
+
+       r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+       free_page((unsigned long)contents);
+
+       return r;
+}
+
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+       unsigned long sig_hva, secs_hva, token_hva, rflags;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       gva_t sig_gva, secs_gva, token_gva;
+       gpa_t sig_gpa, secs_gpa, token_gpa;
+       int ret, trapnr;
+
+       if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+           sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+           sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+               return 1;
+
+       /*
+        * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+        * Resume the guest on failure to inject a #PF.
+        */
+       if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+           sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+           sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+               return 1;
+
+       /*
+        * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+        * KVM doesn't have to fully process one address at a time.  Exit to
+        * userspace if a GPA is invalid.  Note, all structures are aligned and
+        * cannot split pages.
+        */
+       if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+           sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+           sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+               return 0;
+
+       ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+                            (void __user *)secs_hva,
+                            vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+       if (ret == -EFAULT)
+               return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+       /*
+        * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+        * @token_hva or @secs_hva. This should never happen as KVM checks host
+        * addresses at memslot creation. sgx_virt_einit() has already warned
+        * in this case, so just return.
+        */
+       if (ret < 0)
+               return ret;
+
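+       /*
+        * Mirror EINIT's architectural error reporting: ZF is set if and only
+        * if EINIT failed, and the SGX error code is returned in RAX.
+        */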
+       rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+                                         X86_EFLAGS_AF | X86_EFLAGS_SF |
+                                         X86_EFLAGS_OF);
+       if (ret)
+               rflags |= X86_EFLAGS_ZF;
+       else
+               rflags &= ~X86_EFLAGS_ZF;
+       vmx_set_rflags(vcpu, rflags);
+
+       kvm_rax_write(vcpu, ret);
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+       if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               return false;
+
+       if (leaf >= ECREATE && leaf <= ETRACK)
+               return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+       if (leaf >= EAUG && leaf <= EMODT)
+               return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+       return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+       const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+       return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+       u32 leaf = (u32)kvm_rax_read(vcpu);
+
+       if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+       } else if (!sgx_enabled_in_guest_bios(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+       } else {
+               if (leaf == ECREATE)
+                       return handle_encls_ecreate(vcpu);
+               if (leaf == EINIT)
+                       return handle_encls_einit(vcpu);
+               WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+               vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+               return 0;
+       }
+       return 1;
+}
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+       /*
+        * Use Intel's default value for Skylake hardware if Launch Control is
+        * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+        * Launch Control is supported and enabled, i.e. mimic the reset value
+        * and let the guest write the MSRs at will.  If Launch Control is
+        * supported but disabled, then use the current MSR values as the hash
+        * MSRs exist but are read-only (locked and not writable).
+        */
+       if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+           rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+               sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+               sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+               sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+               sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+       } else {
+               /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+       }
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+              sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *guest_cpuid;
+       u32 eax, ebx, ecx, edx;
+
+       if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+               return true;
+
+       guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+       if (!guest_cpuid)
+               return true;
+
+       cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+       if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+               return true;
+
+       guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+       if (!guest_cpuid)
+               return true;
+
+       cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+       if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+           guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+               return true;
+
+       return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       /*
+        * There is no software enable bit for SGX that is virtualized by
+        * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+        * guest (either by the host or by the guest's BIOS) but enabled in the
+        * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+        * the expected system behavior for ENCLS.
+        */
+       u64 bitmap = -1ull;
+
+       /* Nothing to do if hardware doesn't support SGX */
+       if (!cpu_has_vmx_encls_vmexit())
+               return;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+           sgx_enabled_in_guest_bios(vcpu)) {
+               if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+                       bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+                       if (sgx_intercept_encls_ecreate(vcpu))
+                               bitmap |= (1 << ECREATE);
+               }
+
+               if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+                       bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+               /*
+                * Trap and execute EINIT if launch control is enabled in the
+                * host using the guest's values for launch control MSRs, even
+                * if the guest's values are fixed to hardware default values.
+                * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+                * the MSRs is extraordinarily expensive.
+                */
+               if (boot_cpu_has(X86_FEATURE_SGX_LC))
+                       bitmap |= (1 << EINIT);
+
+               if (!vmcs12 && is_guest_mode(vcpu))
+                       vmcs12 = get_vmcs12(vcpu);
+               if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+                       bitmap |= vmcs12->encls_exiting_bitmap;
+       }
+       vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644 (file)
index 0000000..a400888
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       /* Nothing to do if hardware doesn't support SGX */
+       if (cpu_has_vmx_encls_vmexit())
+               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
index c8e51c0..034adb6 100644 (file)
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VMREAD_BITMAP, vmread_bitmap),
        FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+       FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
index 80232da..1349495 100644 (file)
@@ -69,7 +69,8 @@ struct __packed vmcs12 {
        u64 vm_function_control;
        u64 eptp_list_address;
        u64 pml_address;
-       u64 padding64[3]; /* room for future expansion */
+       u64 encls_exiting_bitmap;
+       u64 padding64[2]; /* room for future expansion */
        /*
         * To allow migration of L1 (complete with its L2 guests) between
         * machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void)
        CHECK_OFFSET(vm_function_control, 296);
        CHECK_OFFSET(eptp_list_address, 304);
        CHECK_OFFSET(pml_address, 312);
+       CHECK_OFFSET(encls_exiting_bitmap, 320);
        CHECK_OFFSET(cr0_guest_host_mask, 344);
        CHECK_OFFSET(cr4_guest_host_mask, 352);
        CHECK_OFFSET(cr0_read_shadow, 360);
index 32cf828..6501d66 100644 (file)
@@ -57,6 +57,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -472,26 +473,6 @@ static const u32 vmx_uret_msrs_list[] = {
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       u64 tmp_eptp = INVALID_PAGE;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!VALID_PAGE(tmp_eptp)) {
-                       tmp_eptp = to_vmx(vcpu)->ept_pointer;
-               } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_MISMATCH;
-                       return;
-               }
-       }
-
-       to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
                void *data)
 {
@@ -501,47 +482,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
                        range->pages);
 }
 
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
-               struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+                                          struct kvm_tlb_range *range)
 {
-       u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
-       /*
-        * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-        * of the base of EPT PML4 table, strip off EPT configuration
-        * information.
-        */
        if (range)
-               return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+               return hyperv_flush_guest_mapping_range(root_ept,
                                kvm_fill_hv_flush_list_func, (void *)range);
        else
-               return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+               return hyperv_flush_guest_mapping(root_ept);
 }
 
 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
                struct kvm_tlb_range *range)
 {
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
        struct kvm_vcpu *vcpu;
-       int ret = 0, i;
+       int ret = 0, i, nr_unique_valid_roots;
+       hpa_t root;
 
-       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_lock(&kvm_vmx->hv_root_ept_lock);
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
-               check_ept_pointer_match(kvm);
+       if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+               nr_unique_valid_roots = 0;
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               /*
+                * Flush all valid roots, and see if all vCPUs have converged
+                * on a common root, in which case future flushes can skip the
+                * loop and flush the common root.
+                */
                kvm_for_each_vcpu(i, vcpu, kvm) {
-                       /* If ept_pointer is invalid pointer, bypass flush request. */
-                       if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
-                               ret |= __hv_remote_flush_tlb_with_range(
-                                       kvm, vcpu, range);
+                       root = to_vmx(vcpu)->hv_root_ept;
+                       if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+                               continue;
+
+                       /*
+                        * Set the tracked root to the first valid root.  Keep
+                        * this root for the entirety of the loop even if more
+                        * roots are encountered as a low effort optimization
+                        * to avoid flushing the same (first) root again.
+                        */
+                       if (++nr_unique_valid_roots == 1)
+                               kvm_vmx->hv_root_ept = root;
+
+                       if (!ret)
+                               ret = hv_remote_flush_root_ept(root, range);
+
+                       /*
+                        * Stop processing roots if a failure occurred and
+                        * multiple valid roots have already been detected.
+                        */
+                       if (ret && nr_unique_valid_roots > 1)
+                               break;
                }
+
+               /*
+                * The optimized flush of a single root can't be used if there
+                * are multiple valid roots (obviously).
+                */
+               if (nr_unique_valid_roots > 1)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
        } else {
-               ret = __hv_remote_flush_tlb_with_range(kvm,
-                               kvm_get_vcpu(kvm, 0), range);
+               ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
        }
 
-       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_unlock(&kvm_vmx->hv_root_ept_lock);
        return ret;
 }
 static int hv_remote_flush_tlb(struct kvm *kvm)
@@ -559,7 +563,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
         * evmcs in single VM shares same assist page.
         */
        if (!*p_hv_pa_pg)
-               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 
        if (!*p_hv_pa_pg)
                return -ENOMEM;
@@ -576,6 +580,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
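+       /*
+        * Track the EPT root used by this vCPU for Hyper-V's remote TLB flush
+        * hypercall.  If vCPUs end up with different roots, the per-VM common
+        * root is invalidated and the next remote flush re-scans all vCPUs.
+        */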
+#if IS_ENABLED(CONFIG_HYPERV)
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+       if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+               spin_lock(&kvm_vmx->hv_root_ept_lock);
+               to_vmx(vcpu)->hv_root_ept = root_ept;
+               if (root_ept != kvm_vmx->hv_root_ept)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
+               spin_unlock(&kvm_vmx->hv_root_ept_lock);
+       }
+#endif
+}
+
 /*
  * Comment's format: document - errata name - stepping - processor name.
  * Refer from
@@ -1570,12 +1589,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+       /*
+        * Emulation of instructions in SGX enclaves is impossible as RIP does
+        * not point at the failing instruction, and even if it did, the code
+        * stream is inaccessible.  Inject #UD instead of exiting to userspace
+        * so that guest userspace can't DoS the guest simply by triggering
+        * emulation (enclaves are CPL3 only).
+        */
+       if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return false;
+       }
        return true;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+       union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
        unsigned long rip, orig_rip;
+       u32 instr_len;
 
        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1586,9 +1618,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
         * i.e. we end up advancing IP with some random value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-           to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+           exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+               instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+               /*
+                * Emulating an enclave's instructions isn't supported as KVM
+                * cannot access the enclave's memory or its true RIP, e.g. the
+                * vmcs.GUEST_RIP points at the exit point of the enclave, not
+                * the RIP that actually triggered the VM-Exit.  But, because
+                * most instructions that cause VM-Exit will #UD in an enclave,
+                * most instruction-based VM-Exits simply do not occur.
+                *
+                * There are a few exceptions, notably the debug instructions
+                * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+                * and generate #DB/#BP as expected, which KVM might intercept.
+                * But again, the CPU does the dirty work and saves an instr
+                * length of zero so VMMs don't shoot themselves in the foot.
+                * WARN if KVM tries to skip a non-zero length instruction on
+                * a VM-Exit from an enclave.
+                */
+               if (!instr_len)
+                       goto rip_updated;
+
+               WARN(exit_reason.enclave_mode,
+                    "KVM: skipping instruction after SGX enclave VM-Exit");
+
                orig_rip = kvm_rip_read(vcpu);
-               rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               rip = orig_rip + instr_len;
 #ifdef CONFIG_X86_64
                /*
                 * We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1604,6 +1660,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
                        return 0;
        }
 
+rip_updated:
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);
 
@@ -1865,6 +1922,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_FEAT_CTL:
                msr_info->data = vmx->msr_ia32_feature_control;
                break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+                       return 1;
+               msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+                       [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+               break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2158,6 +2222,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
+
+               /* SGX may be enabled/disabled by guest's firmware */
+               vmx_write_encls_bitmap(vcpu, NULL);
+               break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               /*
+                * On real hardware, the LE hash MSRs are writable before
+                * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+                * at which point SGX related bits in IA32_FEATURE_CONTROL
+                * become writable.
+                *
+                * KVM does not emulate SGX activation for simplicity, so
+                * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+                * is unlocked.  This is technically not architectural
+                * behavior, but it's close enough.
+                */
+               if (!msr_info->host_initiated &&
+                   (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+                   ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+                   !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+                       return 1;
+               vmx->msr_ia32_sgxlepubkeyhash
+                       [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!msr_info->host_initiated)
@@ -3088,8 +3175,7 @@ static int vmx_get_max_tdp_level(void)
        return 4;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
        u64 eptp = VMX_EPTP_MT_WB;
 
@@ -3098,13 +3184,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
        if (enable_ept_ad_bits &&
            (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
                eptp |= VMX_EPTP_AD_ENABLE_BIT;
-       eptp |= (root_hpa & PAGE_MASK);
+       eptp |= root_hpa;
 
        return eptp;
 }
 
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -3112,16 +3198,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
        u64 eptp;
 
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, pgd, pgd_level);
+               eptp = construct_eptp(vcpu, root_hpa, root_level);
                vmcs_write64(EPT_POINTER, eptp);
 
-               if (kvm_x86_ops.tlb_remote_flush) {
-                       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-                       to_vmx(vcpu)->ept_pointer = eptp;
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_CHECK;
-                       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-               }
+               hv_track_root_ept(vcpu, root_hpa);
 
                if (!enable_unrestricted_guest && !is_paging(vcpu))
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3131,7 +3211,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
-               guest_cr3 = pgd;
+               guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
        }
 
        if (update_guest_cr3)
@@ -4314,15 +4394,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        vmx->secondary_exec_control = exec_control;
 }
 
-static void ept_set_mmio_spte_mask(void)
-{
-       /*
-        * EPT Misconfigurations can be generated if the value of bits 2:0
-        * of an EPT paging-structure entry is 110b (write/execute).
-        */
-       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
 #define VMX_XSS_EXIT_BITMAP 0
 
 /*
@@ -4410,8 +4481,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 
-       if (cpu_has_vmx_encls_vmexit())
-               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+       vmx_write_encls_bitmap(&vmx->vcpu, NULL);
 
        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -5184,17 +5254,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5262,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
-       int err;
-
-       err = kvm_rdpmc(vcpu);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
-       u64 new_bv = kvm_read_edx_eax(vcpu);
-       u32 index = kvm_rcx_read(vcpu);
-
-       int err = kvm_set_xcr(vcpu, index, new_bv);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5384,6 +5421,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
 
+       if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+               return 1;
+
        /*
         * A nested guest cannot optimize MMIO vmexits, because we have an
         * nGPA here instead of the required GPA.
@@ -5485,18 +5525,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_enable_tdp(void)
-{
-       kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-               enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-               enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-               0ull, VMX_EPT_EXECUTABLE_MASK,
-               cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK, 0ull);
-
-       ept_set_mmio_spte_mask();
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5544,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
-       kvm_queue_exception(vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
 static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
@@ -5632,16 +5637,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+#ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
        /*
-        * SGX virtualization is not yet supported.  There is no software
-        * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-        * to prevent the guest from executing ENCLS.
+        * SGX virtualization is disabled.  There is no software enable bit for
+        * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+        * the guest from executing ENCLS (when SGX is supported by hardware).
         */
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
+#endif /* CONFIG_X86_SGX_KVM */
 
 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
 {
@@ -5668,10 +5675,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
-       [EXIT_REASON_INVD]                    = handle_invd,
+       [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
-       [EXIT_REASON_RDPMC]                   = handle_rdpmc,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
+       [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
@@ -5685,8 +5692,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
+       [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
+       [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
@@ -5694,13 +5701,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
-       [EXIT_REASON_RDRAND]                  = handle_invalid_op,
-       [EXIT_REASON_RDSEED]                  = handle_invalid_op,
+       [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
+       [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
@@ -5787,12 +5794,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
               vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
 }
 
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+       unsigned int i;
+       struct vmx_msr_entry *e;
+
+       pr_err("MSR %s:\n", name);
+       for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+               pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmentry_ctl, vmexit_ctl;
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
        unsigned long cr4;
-       u64 efer;
+       int efer_slot;
 
        if (!dump_invalid_vmcs) {
                pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5804,7 +5822,6 @@ void dump_vmcs(void)
        cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        cr4 = vmcs_readl(GUEST_CR4);
-       efer = vmcs_read64(GUEST_IA32_EFER);
        secondary_exec_control = 0;
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5816,9 +5833,7 @@ void dump_vmcs(void)
        pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
               cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
        pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
-       if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
-           (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
-       {
+       if (cpu_has_vmx_ept()) {
                pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
                       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
                pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
@@ -5841,10 +5856,20 @@ void dump_vmcs(void)
        vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
        vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
        vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
-       if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
-           (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
-               pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
-                      efer, vmcs_read64(GUEST_IA32_PAT));
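+       /*
+        * The guest's EFER may be loaded from the VMCS, loaded via the MSR
+        * autoload list, or not loaded at all; report the most authoritative
+        * value available, falling back to KVM's software model of EFER.
+        */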
+       efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+       else if (efer_slot >= 0)
+               pr_err("EFER= 0x%016llx (autoload)\n",
+                      vmx->msr_autoload.guest.val[efer_slot].value);
+       else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer | (EFER_LMA | EFER_LME));
+       else
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
        pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
               vmcs_read64(GUEST_IA32_DEBUGCTL),
               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5860,6 +5885,10 @@ void dump_vmcs(void)
        if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                pr_err("InterruptStatus = %04x\n",
                       vmcs_read16(GUEST_INTR_STATUS));
+       if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+       if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+               vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
 
        pr_err("*** Host State ***\n");
        pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
@@ -5881,14 +5910,16 @@ void dump_vmcs(void)
               vmcs_readl(HOST_IA32_SYSENTER_ESP),
               vmcs_read32(HOST_IA32_SYSENTER_CS),
               vmcs_readl(HOST_IA32_SYSENTER_EIP));
-       if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
-               pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
-                      vmcs_read64(HOST_IA32_EFER),
-                      vmcs_read64(HOST_IA32_PAT));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
        if (cpu_has_load_perf_global_ctrl() &&
            vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                pr_err("PerfGlobCtl = 0x%016llx\n",
                       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+       if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
 
        pr_err("*** Control State ***\n");
        pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -5997,7 +6028,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (exit_reason.failed_vmentry) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason.full;
@@ -6006,7 +6037,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (unlikely(vmx->fail)) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6092,7 +6123,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 unexpected_vmexit:
        vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
                    exit_reason.full);
-       dump_vmcs();
+       dump_vmcs(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
@@ -6976,6 +7007,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+       vcpu_setup_sgx_lepubkeyhash(vcpu);
+
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
 
@@ -6989,8 +7022,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;
 
-       vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+       vmx->hv_root_ept = INVALID_PAGE;
+#endif
        return 0;
 
 free_vmcs:
@@ -7007,7 +7041,9 @@ free_vpid:
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-       spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
 
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
@@ -7302,6 +7338,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        set_cr4_guest_host_mask(vmx);
 
+       vmx_write_encls_bitmap(vcpu, NULL);
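+
+       /*
+        * Let the guest's firmware toggle the SGX enable bits in
+        * IA32_FEATURE_CONTROL only if the corresponding CPUID features are
+        * exposed to the guest.
+        */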
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+               vmx->msr_ia32_feature_control_valid_bits |=
+                       FEAT_CTL_SGX_LC_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &=
+                       ~FEAT_CTL_SGX_LC_ENABLED;
+
        /* Refresh #PF interception to account for MAXPHYADDR changes. */
        vmx_update_exception_bitmap(vcpu);
 }
@@ -7322,6 +7371,13 @@ static __init void vmx_set_cpu_caps(void)
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+       if (!enable_sgx) {
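+               /* Hide all SGX-related CPUID bits if SGX virtualization is disabled. */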
+               kvm_cpu_cap_clear(X86_FEATURE_SGX);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+       }
+
        if (vmx_umip_emulated())
                kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
@@ -7848,7 +7904,8 @@ static __init int hardware_setup(void)
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        if (enable_ept)
-               vmx_enable_tdp();
+               kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+                                     cpu_has_vmx_ept_execute_only());
 
        if (!enable_ept)
                ept_lpage_level = 0;
@@ -7909,6 +7966,8 @@ static __init int hardware_setup(void)
        if (!enable_ept || !cpu_has_vmx_intel_pt())
                pt_mode = PT_MODE_SYSTEM;
 
+       setup_default_sgx_lepubkeyhash();
+
        if (nested) {
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
                                           vmx_capability.ept);
index 89da5e1..19fe09f 100644 (file)
@@ -325,7 +325,12 @@ struct vcpu_vmx {
         */
        u64 msr_ia32_feature_control;
        u64 msr_ia32_feature_control_valid_bits;
-       u64 ept_pointer;
+       /* SGX Launch Control public key hash */
+       u64 msr_ia32_sgxlepubkeyhash[4];
+
+#if IS_ENABLED(CONFIG_HYPERV)
+       u64 hv_root_ept;
+#endif
 
        struct pt_desc pt_desc;
        struct lbr_desc lbr_desc;
@@ -338,12 +343,6 @@ struct vcpu_vmx {
        } shadow_msr_intercept;
 };
 
-enum ept_pointers_status {
-       EPT_POINTERS_CHECK = 0,
-       EPT_POINTERS_MATCH = 1,
-       EPT_POINTERS_MISMATCH = 2
-};
-
 struct kvm_vmx {
        struct kvm kvm;
 
@@ -351,8 +350,10 @@ struct kvm_vmx {
        bool ept_identity_pagetable_done;
        gpa_t ept_identity_map_addr;
 
-       enum ept_pointers_status ept_pointers_match;
-       spinlock_t ept_pointer_lock;
+#if IS_ENABLED(CONFIG_HYPERV)
+       hpa_t hv_root_ept;
+       spinlock_t hv_root_ept_lock;
+#endif
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -376,8 +377,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
@@ -543,6 +543,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
        return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
 }
 
-void dump_vmcs(void);
+void dump_vmcs(struct kvm_vcpu *vcpu);
 
 #endif /* __KVM_X86_VMX_H */
index 692b0c3..164b64f 100644 (file)
@@ -37,6 +37,10 @@ static __always_inline void vmcs_check32(unsigned long field)
 {
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
                         "32-bit accessor invalid for 16-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+                        "32-bit accessor invalid for 64-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+                        "32-bit accessor invalid for 64-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
                         "32-bit accessor invalid for natural width field");
 }
index eca6362..f0d0b6e 100644 (file)
@@ -75,6 +75,7 @@
 #include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -245,6 +246,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("l1d_flush", l1d_flush),
        VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("nested_run", nested_run),
+       VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
+       VCPU_STAT("directed_yield_successful", directed_yield_successful),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -543,8 +547,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
-               if (has_error && !is_protmode(vcpu))
-                       has_error = false;
                if (reinject) {
                        /*
                         * On vmentry, vcpu->arch.exception.pending is only
@@ -983,14 +985,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        return 0;
 }
 
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 {
-       if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
-               return __kvm_set_xcr(vcpu, index, xcr);
+       if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+           __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
 
-       return 1;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
 
 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
@@ -1191,20 +1196,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;
-       int err;
 
-       err = kvm_pmu_rdpmc(vcpu, ecx, &data);
-       if (err)
-               return err;
+       if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        kvm_rax_write(vcpu, (u32)data);
        kvm_rdx_write(vcpu, data >> 32);
-       return err;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1791,6 +1797,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+       return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
        xfer_to_guest_mode_prepare();
@@ -3382,6 +3422,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = 0;
                break;
        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+                       return kvm_pmu_get_msr(vcpu, msr_info);
+               if (!msr_info->host_initiated)
+                       return 1;
+               msr_info->data = 0;
+               break;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -3771,8 +3817,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_X86_USER_SPACE_MSR:
        case KVM_CAP_X86_MSR_FILTER:
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+       case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+       case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               return KVM_GUESTDBG_VALID_MASK;
 #ifdef CONFIG_KVM_XEN
        case KVM_CAP_XEN_HVM:
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
@@ -4675,7 +4727,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                        kvm_update_pv_runtime(vcpu);
 
                return 0;
-
        default:
                return -EINVAL;
        }
@@ -5357,6 +5408,28 @@ split_irqchip_unlock:
                        kvm->arch.bus_lock_detection_enabled = true;
                r = 0;
                break;
+#ifdef CONFIG_X86_SGX_KVM
+       case KVM_CAP_SGX_ATTRIBUTE: {
+               unsigned long allowed_attributes = 0;
+
+               r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+               if (r)
+                       break;
+
+               /* KVM only supports the PROVISIONKEY privileged attribute. */
+               if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+                   !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+                       kvm->arch.sgx_provisioning_allowed = true;
+               else
+                       r = -EINVAL;
+               break;
+       }
+#endif
+       case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+               r = -EINVAL;
+               if (kvm_x86_ops.vm_copy_enc_context_from)
+                       r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+               return r;
        default:
                r = -EINVAL;
                break;
@@ -6001,6 +6074,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception)
@@ -6017,6 +6091,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
        access |= PFERR_WRITE_MASK;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -8045,9 +8120,6 @@ int kvm_arch_init(void *opaque)
        if (r)
                goto out_free_percpu;
 
-       kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
-                       PT_PRESENT_MASK, 0, sme_me_mask);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -8207,21 +8279,35 @@ void kvm_apicv_init(struct kvm *kvm, bool enable)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_init);
 
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 {
        struct kvm_vcpu *target = NULL;
        struct kvm_apic_map *map;
 
+       vcpu->stat.directed_yield_attempted++;
+
        rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
+       map = rcu_dereference(vcpu->kvm->arch.apic_map);
 
        if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
                target = map->phys_map[dest_id]->vcpu;
 
        rcu_read_unlock();
 
-       if (target && READ_ONCE(target->ready))
-               kvm_vcpu_yield_to(target);
+       if (!target || !READ_ONCE(target->ready))
+               goto no_yield;
+
+       /* Ignore requests to yield to self */
+       if (vcpu == target)
+               goto no_yield;
+
+       if (kvm_vcpu_yield_to(target) <= 0)
+               goto no_yield;
+
+       vcpu->stat.directed_yield_successful++;
+
+no_yield:
+       return;
 }
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -8268,7 +8354,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                        break;
 
                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
-               kvm_sched_yield(vcpu->kvm, a1);
+               kvm_sched_yield(vcpu, a1);
                ret = 0;
                break;
 #ifdef CONFIG_X86_64
@@ -8286,7 +8372,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
                        break;
 
-               kvm_sched_yield(vcpu->kvm, a0);
+               kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
        default:
@@ -8369,6 +8455,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
 }
 
+
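+/*
+ * Process events pending for the nested (L2) guest: reflect a pending triple
+ * fault into L1 as a nested VM-Exit, then let vendor code evaluate any other
+ * pending nested events.
+ */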
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+               return -EIO;
+
+       if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+               return 1;
+       }
+
+       return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.exception.has_error_code && !is_protmode(vcpu))
+               vcpu->arch.exception.has_error_code = false;
+       static_call(kvm_x86_queue_exception)(vcpu);
+}
+
 static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
        int r;
@@ -8377,7 +8484,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        /* try to reinject previous events if any */
 
        if (vcpu->arch.exception.injected) {
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
        /*
@@ -8414,7 +8521,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        goto busy;
        }
@@ -8440,7 +8547,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                        }
                }
 
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
 
@@ -8977,10 +9084,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
-                       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
-                       vcpu->mmio_needed = 0;
-                       r = 0;
-                       goto out;
+                       if (is_guest_mode(vcpu)) {
+                               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+                       } else {
+                               vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+                               vcpu->mmio_needed = 0;
+                               r = 0;
+                               goto out;
+                       }
                }
                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
                        /* Page is swapped out. Do synthetic halt */
@@ -9278,7 +9389,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu))
-               kvm_x86_ops.nested_ops->check_events(vcpu);
+               kvm_check_nested_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -11020,6 +11131,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
        return false;
 }
 
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+               return true;
+
+       return false;
+}
+
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.preempted_in_kernel;
@@ -11541,7 +11660,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_mmu_unload(vcpu);
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default:
index 9035e34..5334bf4 100644 (file)
@@ -8,6 +8,14 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
+({                                                                     \
+       bool failed = (consistency_check);                              \
+       if (failed)                                                     \
+               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+       failed;                                                         \
+})
+
 #define KVM_DEFAULT_PLE_GAP            128
 #define KVM_VMX_DEFAULT_PLE_WINDOW     4096
 #define KVM_DEFAULT_PLE_WINDOW_GROW    2
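
KVM_NESTED_VMENTER_CONSISTENCY_CHECK() treats its argument as a failure condition: when the expression is true, it emits a trace_kvm_nested_vmenter_failed event carrying the stringified check and evaluates to true so the caller can bail out. A hedged usage sketch, with some_field_is_invalid() standing in for any vendor-specific predicate:

    if (KVM_NESTED_VMENTER_CONSISTENCY_CHECK(some_field_is_invalid(vcpu)))
            return -EINVAL;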
@@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
 
 #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
 
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.exception.pending = false;
index cb9b4c4..6ee7031 100644 (file)
@@ -129,6 +129,7 @@ static int sev_cmd_buffer_len(int cmd)
        case SEV_CMD_DOWNLOAD_FIRMWARE:         return sizeof(struct sev_data_download_firmware);
        case SEV_CMD_GET_ID:                    return sizeof(struct sev_data_get_id);
        case SEV_CMD_ATTESTATION_REPORT:        return sizeof(struct sev_data_attestation_report);
+       case SEV_CMD_SEND_CANCEL:                       return sizeof(struct sev_data_send_cancel);
        default:                                return 0;
        }
 
@@ -141,6 +142,7 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        struct sev_device *sev;
        unsigned int phys_lsb, phys_msb;
        unsigned int reg, ret = 0;
+       int buf_len;
 
        if (!psp || !psp->sev_data)
                return -ENODEV;
@@ -150,15 +152,27 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 
        sev = psp->sev_data;
 
+       buf_len = sev_cmd_buffer_len(cmd);
+       if (WARN_ON_ONCE(!data != !buf_len))
+               return -EINVAL;
+
+       /*
+        * Copy the incoming data to driver's scratch buffer as __pa() will not
+        * work for some memory, e.g. vmalloc'd addresses, and @data may not be
+        * physically contiguous.
+        */
+       if (data)
+               memcpy(sev->cmd_buf, data, buf_len);
+
        /* Get the physical address of the command buffer */
-       phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
-       phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
+       phys_lsb = data ? lower_32_bits(__psp_pa(sev->cmd_buf)) : 0;
+       phys_msb = data ? upper_32_bits(__psp_pa(sev->cmd_buf)) : 0;
 
        dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n",
                cmd, phys_msb, phys_lsb, psp_timeout);
 
        print_hex_dump_debug("(in):  ", DUMP_PREFIX_OFFSET, 16, 2, data,
-                            sev_cmd_buffer_len(cmd), false);
+                            buf_len, false);
 
        iowrite32(phys_lsb, sev->io_regs + sev->vdata->cmdbuff_addr_lo_reg);
        iowrite32(phys_msb, sev->io_regs + sev->vdata->cmdbuff_addr_hi_reg);
@@ -194,7 +208,14 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        }
 
        print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
-                            sev_cmd_buffer_len(cmd), false);
+                            buf_len, false);
+
+       /*
+        * Copy potential output from the PSP back to data.  Do this even on
+        * failure in case the caller wants to glean something from the error.
+        */
+       if (data)
+               memcpy(data, sev->cmd_buf, buf_len);
 
        return ret;
 }
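
With the bounce through sev->cmd_buf above, callers no longer need to pass a physically contiguous, __pa()-translatable buffer; a stack or vmalloc'd buffer works, which is what the conversions later in this file rely on. A minimal caller sketch under that assumption:

    struct sev_user_data_status status;     /* plain stack allocation */
    int error, rc;

    rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &status, &error);
    if (!rc)
            pr_debug("SEV platform state: %d\n", status.state);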
@@ -213,6 +234,7 @@ static int sev_do_cmd(int cmd, void *data, int *psp_ret)
 static int __sev_platform_init_locked(int *error)
 {
        struct psp_device *psp = psp_master;
+       struct sev_data_init data;
        struct sev_device *sev;
        int rc = 0;
 
@@ -224,6 +246,7 @@ static int __sev_platform_init_locked(int *error)
        if (sev->state == SEV_STATE_INIT)
                return 0;
 
+       memset(&data, 0, sizeof(data));
        if (sev_es_tmr) {
                u64 tmr_pa;
 
@@ -233,12 +256,12 @@ static int __sev_platform_init_locked(int *error)
                 */
                tmr_pa = __pa(sev_es_tmr);
 
-               sev->init_cmd_buf.flags |= SEV_INIT_FLAGS_SEV_ES;
-               sev->init_cmd_buf.tmr_address = tmr_pa;
-               sev->init_cmd_buf.tmr_len = SEV_ES_TMR_SIZE;
+               data.flags |= SEV_INIT_FLAGS_SEV_ES;
+               data.tmr_address = tmr_pa;
+               data.tmr_len = SEV_ES_TMR_SIZE;
        }
 
-       rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error);
+       rc = __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
        if (rc)
                return rc;
 
@@ -295,15 +318,14 @@ static int sev_platform_shutdown(int *error)
 
 static int sev_get_platform_state(int *state, int *error)
 {
-       struct sev_device *sev = psp_master->sev_data;
+       struct sev_user_data_status data;
        int rc;
 
-       rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS,
-                                &sev->status_cmd_buf, error);
+       rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, error);
        if (rc)
                return rc;
 
-       *state = sev->status_cmd_buf.state;
+       *state = data.state;
        return rc;
 }
 
@@ -341,15 +363,14 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp, bool writable)
 
 static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
 {
-       struct sev_device *sev = psp_master->sev_data;
-       struct sev_user_data_status *data = &sev->status_cmd_buf;
+       struct sev_user_data_status data;
        int ret;
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, &argp->error);
        if (ret)
                return ret;
 
-       if (copy_to_user((void __user *)argp->data, data, sizeof(*data)))
+       if (copy_to_user((void __user *)argp->data, &data, sizeof(data)))
                ret = -EFAULT;
 
        return ret;
@@ -376,7 +397,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
 {
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pek_csr input;
-       struct sev_data_pek_csr *data;
+       struct sev_data_pek_csr data;
        void __user *input_address;
        void *blob = NULL;
        int ret;
@@ -387,9 +408,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* userspace wants to query CSR length */
        if (!input.address || !input.length)
@@ -397,19 +416,15 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
 
        /* allocate a physically contiguous buffer to store the CSR blob */
        input_address = (void __user *)input.address;
-       if (input.length > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.length > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        blob = kmalloc(input.length, GFP_KERNEL);
-       if (!blob) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (!blob)
+               return -ENOMEM;
 
-       data->address = __psp_pa(blob);
-       data->len = input.length;
+       data.address = __psp_pa(blob);
+       data.len = input.length;
 
 cmd:
        if (sev->state == SEV_STATE_UNINIT) {
@@ -418,10 +433,10 @@ cmd:
                        goto e_free_blob;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, &data, &argp->error);
 
         /* If we query the CSR length, FW responded with expected data. */
-       input.length = data->len;
+       input.length = data.len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -435,8 +450,6 @@ cmd:
 
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -456,21 +469,20 @@ EXPORT_SYMBOL_GPL(psp_copy_user_blob);
 static int sev_get_api_version(void)
 {
        struct sev_device *sev = psp_master->sev_data;
-       struct sev_user_data_status *status;
+       struct sev_user_data_status status;
        int error = 0, ret;
 
-       status = &sev->status_cmd_buf;
-       ret = sev_platform_status(status, &error);
+       ret = sev_platform_status(&status, &error);
        if (ret) {
                dev_err(sev->dev,
                        "SEV: failed to get status. Error: %#x\n", error);
                return 1;
        }
 
-       sev->api_major = status->api_major;
-       sev->api_minor = status->api_minor;
-       sev->build = status->build;
-       sev->state = status->state;
+       sev->api_major = status.api_major;
+       sev->api_minor = status.api_minor;
+       sev->build = status.build;
+       sev->state = status.state;
 
        return 0;
 }
@@ -568,7 +580,7 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
 {
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pek_cert_import input;
-       struct sev_data_pek_cert_import *data;
+       struct sev_data_pek_cert_import data;
        void *pek_blob, *oca_blob;
        int ret;
 
@@ -578,19 +590,14 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
        /* copy PEK certificate blobs from userspace */
        pek_blob = psp_copy_user_blob(input.pek_cert_address, input.pek_cert_len);
-       if (IS_ERR(pek_blob)) {
-               ret = PTR_ERR(pek_blob);
-               goto e_free;
-       }
+       if (IS_ERR(pek_blob))
+               return PTR_ERR(pek_blob);
 
-       data->pek_cert_address = __psp_pa(pek_blob);
-       data->pek_cert_len = input.pek_cert_len;
+       data.reserved = 0;
+       data.pek_cert_address = __psp_pa(pek_blob);
+       data.pek_cert_len = input.pek_cert_len;
 
        /* copy PEK certificate blobs from userspace */
        oca_blob = psp_copy_user_blob(input.oca_cert_address, input.oca_cert_len);
@@ -599,8 +606,8 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
                goto e_free_pek;
        }
 
-       data->oca_cert_address = __psp_pa(oca_blob);
-       data->oca_cert_len = input.oca_cert_len;
+       data.oca_cert_address = __psp_pa(oca_blob);
+       data.oca_cert_len = input.oca_cert_len;
 
        /* If platform is not in INIT state then transition it to INIT */
        if (sev->state != SEV_STATE_INIT) {
@@ -609,21 +616,19 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
                        goto e_free_oca;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, &data, &argp->error);
 
 e_free_oca:
        kfree(oca_blob);
 e_free_pek:
        kfree(pek_blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
 static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 {
        struct sev_user_data_get_id2 input;
-       struct sev_data_get_id *data;
+       struct sev_data_get_id data;
        void __user *input_address;
        void *id_blob = NULL;
        int ret;
@@ -637,28 +642,25 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 
        input_address = (void __user *)input.address;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
        if (input.address && input.length) {
                id_blob = kmalloc(input.length, GFP_KERNEL);
-               if (!id_blob) {
-                       kfree(data);
+               if (!id_blob)
                        return -ENOMEM;
-               }
 
-               data->address = __psp_pa(id_blob);
-               data->len = input.length;
+               data.address = __psp_pa(id_blob);
+               data.len = input.length;
+       } else {
+               data.address = 0;
+               data.len = 0;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, &data, &argp->error);
 
        /*
         * Firmware will return the length of the ID value (either the minimum
         * required length or the actual length written), return it to the user.
         */
-       input.length = data->len;
+       input.length = data.len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -666,7 +668,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
        }
 
        if (id_blob) {
-               if (copy_to_user(input_address, id_blob, data->len)) {
+               if (copy_to_user(input_address, id_blob, data.len)) {
                        ret = -EFAULT;
                        goto e_free;
                }
@@ -674,7 +676,6 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 
 e_free:
        kfree(id_blob);
-       kfree(data);
 
        return ret;
 }
@@ -724,7 +725,7 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pdh_cert_export input;
        void *pdh_blob = NULL, *cert_blob = NULL;
-       struct sev_data_pdh_cert_export *data;
+       struct sev_data_pdh_cert_export data;
        void __user *input_cert_chain_address;
        void __user *input_pdh_cert_address;
        int ret;
@@ -742,9 +743,7 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* Userspace wants to query the certificate length. */
        if (!input.pdh_cert_address ||
@@ -756,25 +755,19 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        input_cert_chain_address = (void __user *)input.cert_chain_address;
 
        /* Allocate a physically contiguous buffer to store the PDH blob. */
-       if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        /* Allocate a physically contiguous buffer to store the cert chain blob. */
-       if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        pdh_blob = kmalloc(input.pdh_cert_len, GFP_KERNEL);
-       if (!pdh_blob) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (!pdh_blob)
+               return -ENOMEM;
 
-       data->pdh_cert_address = __psp_pa(pdh_blob);
-       data->pdh_cert_len = input.pdh_cert_len;
+       data.pdh_cert_address = __psp_pa(pdh_blob);
+       data.pdh_cert_len = input.pdh_cert_len;
 
        cert_blob = kmalloc(input.cert_chain_len, GFP_KERNEL);
        if (!cert_blob) {
@@ -782,15 +775,15 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
                goto e_free_pdh;
        }
 
-       data->cert_chain_address = __psp_pa(cert_blob);
-       data->cert_chain_len = input.cert_chain_len;
+       data.cert_chain_address = __psp_pa(cert_blob);
+       data.cert_chain_len = input.cert_chain_len;
 
 cmd:
-       ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, &data, &argp->error);
 
        /* If we query the length, FW responded with expected data. */
-       input.cert_chain_len = data->cert_chain_len;
-       input.pdh_cert_len = data->pdh_cert_len;
+       input.cert_chain_len = data.cert_chain_len;
+       input.pdh_cert_len = data.pdh_cert_len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -815,8 +808,6 @@ e_free_cert:
        kfree(cert_blob);
 e_free_pdh:
        kfree(pdh_blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -976,6 +967,10 @@ int sev_dev_init(struct psp_device *psp)
        if (!sev)
                goto e_err;
 
+       sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 0);
+       if (!sev->cmd_buf)
+               goto e_sev;
+
        psp->sev_data = sev;
 
        sev->dev = dev;
@@ -987,7 +982,7 @@ int sev_dev_init(struct psp_device *psp)
        if (!sev->vdata) {
                ret = -ENODEV;
                dev_err(dev, "sev: missing driver data\n");
-               goto e_err;
+               goto e_buf;
        }
 
        psp_set_sev_irq_handler(psp, sev_irq_handler, sev);
@@ -1002,6 +997,10 @@ int sev_dev_init(struct psp_device *psp)
 
 e_irq:
        psp_clear_sev_irq_handler(psp);
+e_buf:
+       devm_free_pages(dev, (unsigned long)sev->cmd_buf);
+e_sev:
+       devm_kfree(dev, sev);
 e_err:
        psp->sev_data = NULL;
 
index dd5c4fe..666c21e 100644 (file)
@@ -46,12 +46,12 @@ struct sev_device {
        unsigned int int_rcvd;
        wait_queue_head_t int_queue;
        struct sev_misc_dev *misc;
-       struct sev_user_data_status status_cmd_buf;
-       struct sev_data_init init_cmd_buf;
 
        u8 api_major;
        u8 api_minor;
        u8 build;
+
+       void *cmd_buf;
 };
 
 int sev_dev_init(struct psp_device *psp);
index 1b65e72..8895b95 100644 (file)
@@ -192,8 +192,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr);
 
@@ -218,6 +218,20 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+struct kvm_gfn_range {
+       struct kvm_memory_slot *slot;
+       gfn_t start;
+       gfn_t end;
+       pte_t pte;
+       bool may_block;
+};
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+#endif
+
 enum {
        OUTSIDE_GUEST_MODE,
        IN_GUEST_MODE,
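
These hooks replace the old hva-based MMU notifier callbacks: common code resolves the notifier's hva range to memslots and hands each overlapping slot to the architecture as a gfn range. A hedged skeleton of an arch implementation, where zap_gfn_in_slot() is a hypothetical helper and the bool result tells the caller whether a remote TLB flush is needed:

    bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
    {
            bool flush = false;
            gfn_t gfn;

            /* range->end is exclusive; range->slot is the overlapping memslot. */
            for (gfn = range->start; gfn < range->end; gfn++)
                    flush |= zap_gfn_in_slot(kvm, range->slot, gfn);

            return flush;
    }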
@@ -640,6 +654,7 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
+bool file_is_kvm(struct file *file);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
@@ -886,7 +901,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
+                                       const struct kvm_memory_slot *memslot);
 #else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
@@ -945,6 +960,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 
@@ -1116,7 +1132,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 }
 
 static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
        return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 }
index b801ead..d48a719 100644 (file)
@@ -73,6 +73,7 @@ enum sev_cmd {
        SEV_CMD_SEND_UPDATE_DATA        = 0x041,
        SEV_CMD_SEND_UPDATE_VMSA        = 0x042,
        SEV_CMD_SEND_FINISH             = 0x043,
+       SEV_CMD_SEND_CANCEL             = 0x044,
 
        /* Guest migration commands (incoming) */
        SEV_CMD_RECEIVE_START           = 0x050,
@@ -326,11 +327,11 @@ struct sev_data_send_start {
        u64 pdh_cert_address;                   /* In */
        u32 pdh_cert_len;                       /* In */
        u32 reserved1;
-       u64 plat_cert_address;                  /* In */
-       u32 plat_cert_len;                      /* In */
+       u64 plat_certs_address;                 /* In */
+       u32 plat_certs_len;                     /* In */
        u32 reserved2;
-       u64 amd_cert_address;                   /* In */
-       u32 amd_cert_len;                       /* In */
+       u64 amd_certs_address;                  /* In */
+       u32 amd_certs_len;                      /* In */
        u32 reserved3;
        u64 session_address;                    /* In */
        u32 session_len;                        /* In/Out */
@@ -392,6 +393,15 @@ struct sev_data_send_finish {
        u32 handle;                             /* In */
 } __packed;
 
+/**
+ * struct sev_data_send_cancel - SEND_CANCEL command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_cancel {
+       u32 handle;                             /* In */
+} __packed;
+
 /**
  * struct sev_data_receive_start - RECEIVE_START command parameters
  *
index 49d7d0f..37e1e1a 100644 (file)
@@ -255,30 +255,6 @@ TRACE_EVENT(kvm_fpu,
        TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
 );
 
-TRACE_EVENT(kvm_age_page,
-       TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref),
-       TP_ARGS(gfn, level, slot, ref),
-
-       TP_STRUCT__entry(
-               __field(        u64,    hva             )
-               __field(        u64,    gfn             )
-               __field(        u8,     level           )
-               __field(        u8,     referenced      )
-       ),
-
-       TP_fast_assign(
-               __entry->gfn            = gfn;
-               __entry->level          = level;
-               __entry->hva            = ((gfn - slot->base_gfn) <<
-                                           PAGE_SHIFT) + slot->userspace_addr;
-               __entry->referenced     = ref;
-       ),
-
-       TP_printk("hva %llx gfn %llx level %u %s",
-                 __entry->hva, __entry->gfn, __entry->level,
-                 __entry->referenced ? "YOUNG" : "OLD")
-);
-
 #ifdef CONFIG_KVM_ASYNC_PF
 DECLARE_EVENT_CLASS(kvm_async_get_page_class,
 
@@ -462,6 +438,72 @@ TRACE_EVENT(kvm_dirty_ring_exit,
        TP_printk("vcpu %d", __entry->vcpu_id)
 );
 
+TRACE_EVENT(kvm_unmap_hva_range,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
index f6afee2..d765334 100644 (file)
@@ -1078,6 +1078,9 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_SET_GUEST_DEBUG2 195
+#define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1671,6 +1674,8 @@ enum sev_cmd_id {
        KVM_SEV_CERT_EXPORT,
        /* Attestation report */
        KVM_SEV_GET_ATTESTATION_REPORT,
+       /* Guest Migration Extension */
+       KVM_SEV_SEND_CANCEL,
 
        KVM_SEV_NR_MAX,
 };
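
The new capability numbers and the KVM_SEV_SEND_CANCEL command id above extend the uapi surface. A hedged userspace sketch of exercising both, assuming the usual SEV command path (KVM_MEMORY_ENCRYPT_OP on the VM fd with the /dev/sev fd in sev_fd) and that KVM_CAP_VM_COPY_ENC_CONTEXT_FROM takes the source VM's fd in args[0]; needs <sys/ioctl.h>, <linux/kvm.h> and <err.h>:

    /* Cancel an in-flight outgoing SEV migration; the command has no payload. */
    struct kvm_sev_cmd cmd = {
            .id     = KVM_SEV_SEND_CANCEL,
            .sev_fd = sev_device_fd,        /* fd of /dev/sev */
    };

    if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
            err(1, "KVM_SEV_SEND_CANCEL (fw error %u)", cmd.error);

    /* Mirror the source VM's SEV encryption context into a second VM. */
    struct kvm_enable_cap cap = {
            .cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM,
            .args[0] = source_vm_fd,
    };

    if (ioctl(mirror_vm_fd, KVM_ENABLE_CAP, &cap))
            err(1, "KVM_CAP_VM_COPY_ENC_CONTEXT_FROM");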
@@ -1729,6 +1734,45 @@ struct kvm_sev_attestation_report {
        __u32 len;
 };
 
+struct kvm_sev_send_start {
+       __u32 policy;
+       __u64 pdh_cert_uaddr;
+       __u32 pdh_cert_len;
+       __u64 plat_certs_uaddr;
+       __u32 plat_certs_len;
+       __u64 amd_certs_uaddr;
+       __u32 amd_certs_len;
+       __u64 session_uaddr;
+       __u32 session_len;
+};
+
+struct kvm_sev_send_update_data {
+       __u64 hdr_uaddr;
+       __u32 hdr_len;
+       __u64 guest_uaddr;
+       __u32 guest_len;
+       __u64 trans_uaddr;
+       __u32 trans_len;
+};
+
+struct kvm_sev_receive_start {
+       __u32 handle;
+       __u32 policy;
+       __u64 pdh_uaddr;
+       __u32 pdh_len;
+       __u64 session_uaddr;
+       __u32 session_len;
+};
+
+struct kvm_sev_receive_update_data {
+       __u64 hdr_uaddr;
+       __u32 hdr_len;
+       __u64 guest_uaddr;
+       __u32 guest_len;
+       __u64 trans_uaddr;
+       __u32 trans_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
index e4732d3..4f3d5aa 100644 (file)
 #define HUGETLB_FLAG_ENCODE_SHIFT      26
 #define HUGETLB_FLAG_ENCODE_MASK       0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB       (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB       (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB      (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB                (20 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB                (21 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB                (23 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB       (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB       (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB      (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB      (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB                (30 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB                (31 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB       (34 << HUGETLB_FLAG_ENCODE_SHIFT)
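
The new entries follow the existing scheme: the log2 of the huge page size is placed in the bit field starting at HUGETLB_FLAG_ENCODE_SHIFT (26), matching the kernel's MAP_HUGE_* encoding for mmap(). A fragment showing the intended use, assuming the host actually has 512MB hugetlb pages available and that size is a multiple of that page size:

    void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                     HUGETLB_FLAG_ENCODE_512MB,
                     -1, 0);
    if (mem == MAP_FAILED)
            err(1, "mmap(MAP_HUGETLB, 512MB)");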
index 7bd7e77..34414e8 100644 (file)
@@ -38,6 +38,7 @@
 /dirty_log_perf_test
 /hardware_disable_test
 /kvm_create_max_vcpus
+/kvm_page_table_test
 /memslot_modification_stress_test
 /set_memory_region_test
 /steal_time
index 67eebb5..6b0a9e7 100644 (file)
@@ -72,6 +72,7 @@ TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
@@ -82,6 +83,7 @@ TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 
@@ -91,6 +93,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
index bb2752d..81edbd2 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <asm/barrier.h>
+#include <linux/atomic.h>
 
 #include "kvm_util.h"
 #include "test_util.h"
@@ -137,12 +138,20 @@ static uint64_t host_clear_count;
 static uint64_t host_track_next_count;
 
 /* Whether dirty ring reset is requested, or finished */
-static sem_t dirty_ring_vcpu_stop;
-static sem_t dirty_ring_vcpu_cont;
+static sem_t sem_vcpu_stop;
+static sem_t sem_vcpu_cont;
+/*
+ * This is only set by the main thread and only cleared by the vcpu thread.  It
+ * requests the vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC is
+ * the only place where both the "dirty bit" and the "dirty data" are guaranteed
+ * to match.  E.g., SIG_IPI gives no such guarantee if the vcpu is interrupted
+ * after setting the dirty bit but before the data is written.
+ */
+static atomic_t vcpu_sync_stop_requested;
 /*
  * This is updated by the vcpu thread to tell the host whether it's a
  * ring-full event.  It should only be read until a sem_wait() of
- * dirty_ring_vcpu_stop and before vcpu continues to run.
+ * sem_vcpu_stop and before vcpu continues to run.
  */
 static bool dirty_ring_vcpu_ring_full;
 /*
@@ -234,6 +243,17 @@ static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
        kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
 }
 
+/* Should only be called after a GUEST_SYNC */
+static void vcpu_handle_sync_stop(void)
+{
+       if (atomic_read(&vcpu_sync_stop_requested)) {
+               /* The main thread is sleeping, waiting for us to stop */
+               atomic_set(&vcpu_sync_stop_requested, false);
+               sem_post(&sem_vcpu_stop);
+               sem_wait_until(&sem_vcpu_cont);
+       }
+}
+
 static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct kvm_run *run = vcpu_state(vm, VCPU_ID);
@@ -244,6 +264,8 @@ static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
        TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
                    "Invalid guest sync status: exit_reason=%s\n",
                    exit_reason_str(run->exit_reason));
+
+       vcpu_handle_sync_stop();
 }
 
 static bool dirty_ring_supported(void)
@@ -301,13 +323,13 @@ static void dirty_ring_wait_vcpu(void)
 {
        /* This makes sure that hardware PML cache flushed */
        vcpu_kick();
-       sem_wait_until(&dirty_ring_vcpu_stop);
+       sem_wait_until(&sem_vcpu_stop);
 }
 
 static void dirty_ring_continue_vcpu(void)
 {
        pr_info("Notifying vcpu to continue\n");
-       sem_post(&dirty_ring_vcpu_cont);
+       sem_post(&sem_vcpu_cont);
 }
 
 static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
@@ -361,11 +383,11 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
                /* Update the flag first before pause */
                WRITE_ONCE(dirty_ring_vcpu_ring_full,
                           run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
-               sem_post(&dirty_ring_vcpu_stop);
+               sem_post(&sem_vcpu_stop);
                pr_info("vcpu stops because %s...\n",
                        dirty_ring_vcpu_ring_full ?
                        "dirty ring is full" : "vcpu is kicked out");
-               sem_wait_until(&dirty_ring_vcpu_cont);
+               sem_wait_until(&sem_vcpu_cont);
                pr_info("vcpu continues now.\n");
        } else {
                TEST_ASSERT(false, "Invalid guest sync status: "
@@ -377,7 +399,7 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 static void dirty_ring_before_vcpu_join(void)
 {
        /* Kick another round of vcpu just to make sure it will quit */
-       sem_post(&dirty_ring_vcpu_cont);
+       sem_post(&sem_vcpu_cont);
 }
 
 struct log_mode {
@@ -505,9 +527,8 @@ static void *vcpu_worker(void *data)
         */
        sigmask->len = 8;
        pthread_sigmask(0, NULL, sigset);
+       sigdelset(sigset, SIG_IPI);
        vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
-       sigaddset(sigset, SIG_IPI);
-       pthread_sigmask(SIG_BLOCK, sigset, NULL);
 
        sigemptyset(sigset);
        sigaddset(sigset, SIG_IPI);
@@ -768,7 +789,25 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                usleep(p->interval * 1000);
                log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
                                             bmap, host_num_pages);
+
+               /*
+                * See vcpu_sync_stop_requested definition for details on why
+                * we need to stop the vcpu while verifying data.
+                */
+               atomic_set(&vcpu_sync_stop_requested, true);
+               sem_wait_until(&sem_vcpu_stop);
+               /*
+                * NOTE: for dirty ring, it's possible that we didn't stop at
+                * GUEST_SYNC but instead we stopped because ring is full;
+                * that's okay too because ring full means we're only missing
+                * the flush of the last page, and since we handle the last
+                * page specially verification will succeed anyway.
+                */
+               assert(host_log_mode == LOG_MODE_DIRTY_RING ||
+                      atomic_read(&vcpu_sync_stop_requested) == false);
                vm_dirty_log_verify(mode, bmap);
+               sem_post(&sem_vcpu_cont);
+
                iteration++;
                sync_global_to_guest(vm, iteration);
        }
@@ -818,9 +857,10 @@ int main(int argc, char *argv[])
                .interval = TEST_HOST_LOOP_INTERVAL,
        };
        int opt, i;
+       sigset_t sigset;
 
-       sem_init(&dirty_ring_vcpu_stop, 0, 0);
-       sem_init(&dirty_ring_vcpu_cont, 0, 0);
+       sem_init(&sem_vcpu_stop, 0, 0);
+       sem_init(&sem_vcpu_cont, 0, 0);
 
        guest_modes_append_default();
 
@@ -876,6 +916,11 @@ int main(int argc, char *argv[])
 
        srandom(time(0));
 
+       /* Ensure that vCPU threads start with SIG_IPI blocked.  */
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIG_IPI);
+       pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
        if (host_log_mode_option == LOG_MODE_ALL) {
                /* Run each log mode */
                for (i = 0; i < LOG_MODE_NUM; i++) {
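
Taken together, the dirty_log_test changes implement a stop-at-GUEST_SYNC handshake between the main thread and the vCPU thread; since the two halves sit in hunks far apart, here is the combined sequence as a compact sketch using only the semaphores and the atomic flag introduced above:

    /* main thread, once per verification pass */
    atomic_set(&vcpu_sync_stop_requested, true);
    sem_wait_until(&sem_vcpu_stop);         /* vCPU parked at a GUEST_SYNC */
    vm_dirty_log_verify(mode, bmap);        /* no page can be half-written now */
    sem_post(&sem_vcpu_cont);

    /* vCPU thread, after every GUEST_SYNC (see vcpu_handle_sync_stop()) */
    if (atomic_read(&vcpu_sync_stop_requested)) {
            atomic_set(&vcpu_sync_stop_requested, false);
            sem_post(&sem_vcpu_stop);
            sem_wait_until(&sem_vcpu_cont);
    }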
index 0f4258e..0e6cc25 100644 (file)
@@ -69,9 +69,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE          (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE      ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
        unsigned int pa_bits;
        unsigned int va_bits;
@@ -85,6 +82,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
                    struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
 void kvm_vm_free(struct kvm_vm *vmp);
index b7f4139..fade313 100644 (file)
@@ -71,13 +71,32 @@ enum vm_mem_backing_src_type {
        VM_MEM_SRC_ANONYMOUS,
        VM_MEM_SRC_ANONYMOUS_THP,
        VM_MEM_SRC_ANONYMOUS_HUGETLB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+       NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
        const char *name;
-       enum vm_mem_backing_src_type type;
+       uint32_t flag;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
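
The alias table is now indexed by the enum above, each entry pairing a name with a raw flag, and get_backing_src_pagesz() reports the effective backing page size. A hedged usage sketch, assuming the alias strings follow the enum names (e.g. "anonymous_hugetlb_1gb") and the test_util.h helpers declared here; the new kvm_page_table_test below selects its backing type the same way through its -s option:

    enum vm_mem_backing_src_type t = parse_backing_src_type("anonymous_hugetlb_1gb");
    size_t backing_pagesz = get_backing_src_pagesz(t);

    pr_info("backing src: %s, page size: 0x%zx\n",
            vm_mem_backing_src_alias(t)->name, backing_pagesz);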
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644 (file)
index 0000000..1c4753f
--- /dev/null
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP is enabled, or that enough HUGETLB pages of the required
+ * page size have been pre-allocated on your system, if you plan to use
+ * hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX             1
+
+/* Default size (1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE          (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM         0xc0000000
+
+/* Different guest memory accessing stages */
+enum test_stage {
+       KVM_BEFORE_MAPPINGS,
+       KVM_CREATE_MAPPINGS,
+       KVM_UPDATE_MAPPINGS,
+       KVM_ADJUST_MAPPINGS,
+       NUM_TEST_STAGES,
+};
+
+static const char * const test_stage_string[] = {
+       "KVM_BEFORE_MAPPINGS",
+       "KVM_CREATE_MAPPINGS",
+       "KVM_UPDATE_MAPPINGS",
+       "KVM_ADJUST_MAPPINGS",
+};
+
+struct vcpu_args {
+       int vcpu_id;
+       bool vcpu_write;
+};
+
+struct test_args {
+       struct kvm_vm *vm;
+       uint64_t guest_test_virt_mem;
+       uint64_t host_page_size;
+       uint64_t host_num_pages;
+       uint64_t large_page_size;
+       uint64_t large_num_pages;
+       uint64_t host_pages_per_lpage;
+       enum vm_mem_backing_src_type src_type;
+       struct vcpu_args vcpu_args[KVM_MAX_VCPUS];
+};
+
+/*
+ * Guest variables. Use addr_gva2hva() if these variables need to be
+ * changed from the host.
+ */
+static enum test_stage guest_test_stage;
+
+/* Host variables */
+static uint32_t nr_vcpus = 1;
+static struct test_args test_args;
+static enum test_stage *current_stage;
+static bool host_quit;
+
+/* Whether the test stage is updated, or completed */
+static sem_t test_stage_updated;
+static sem_t test_stage_completed;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+static void guest_code(int vcpu_id)
+{
+       struct test_args *p = &test_args;
+       struct vcpu_args *vcpu_args = &p->vcpu_args[vcpu_id];
+       enum test_stage *current_stage = &guest_test_stage;
+       uint64_t addr;
+       int i, j;
+
+       /* Make sure vCPU args data structure is not corrupt */
+       GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
+
+       while (true) {
+               addr = p->guest_test_virt_mem;
+
+               switch (READ_ONCE(*current_stage)) {
+               /*
+                * All vCPU threads will be started in this stage,
+                * where the guest code of each vCPU does nothing.
+                */
+               case KVM_BEFORE_MAPPINGS:
+                       break;
+
+               /*
+                * Before dirty logging, vCPUs concurrently access the first
+                * 8 bytes of each page (host page/large page) within the same
+                * memory region with different access types (read/write).
+                * Then KVM will create normal page mappings or huge block
+                * mappings for them.
+                */
+               case KVM_CREATE_MAPPINGS:
+                       for (i = 0; i < p->large_num_pages; i++) {
+                               if (vcpu_args->vcpu_write)
+                                       *(uint64_t *)addr = 0x0123456789ABCDEF;
+                               else
+                                       READ_ONCE(*(uint64_t *)addr);
+
+                               addr += p->large_page_size;
+                       }
+                       break;
+
+               /*
+                * During dirty logging, if the memory backing src type is
+                * anonymous, KVM only updates the attributes of the normal
+                * page mappings from RO to RW. If the backing src type is THP
+                * or HUGETLB, KVM instead splits the huge block mappings into
+                * normal page mappings.
+                */
+               case KVM_UPDATE_MAPPINGS:
+                       if (p->src_type == VM_MEM_SRC_ANONYMOUS) {
+                               for (i = 0; i < p->host_num_pages; i++) {
+                                       *(uint64_t *)addr = 0x0123456789ABCDEF;
+                                       addr += p->host_page_size;
+                               }
+                               break;
+                       }
+
+                       for (i = 0; i < p->large_num_pages; i++) {
+                               /*
+                                * Write to the first host page in each large
+                                * page region, and trigger splitting of the large pages.
+                                */
+                               *(uint64_t *)addr = 0x0123456789ABCDEF;
+
+                               /*
+                                * Access the middle host pages in each large
+                                * page region. Since dirty logging is enabled,
+                                * this will create new mappings at the smallest
+                                * granularity.
+                                */
+                               addr += p->large_page_size / 2;
+                               for (j = 0; j < p->host_pages_per_lpage / 2; j++) {
+                                       READ_ONCE(*(uint64_t *)addr);
+                                       addr += p->host_page_size;
+                               }
+                       }
+                       break;
+
+               /*
+                * After dirty logging is stopped, vCPUs concurrently read
+                * from every single host page. Then KVM will coalesce the
+                * split page mappings back into block mappings. A TLB
+                * conflict abort can occur here if the TLB entries of the
+                * page mappings are not fully invalidated.
+                */
+               case KVM_ADJUST_MAPPINGS:
+                       for (i = 0; i < p->host_num_pages; i++) {
+                               READ_ONCE(*(uint64_t *)addr);
+                               addr += p->host_page_size;
+                       }
+                       break;
+
+               default:
+                       GUEST_ASSERT(0);
+               }
+
+               GUEST_SYNC(1);
+       }
+}
+
+static void *vcpu_worker(void *data)
+{
+       int ret;
+       struct vcpu_args *vcpu_args = data;
+       struct kvm_vm *vm = test_args.vm;
+       int vcpu_id = vcpu_args->vcpu_id;
+       struct kvm_run *run;
+       struct timespec start;
+       struct timespec ts_diff;
+       enum test_stage stage;
+
+       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+       run = vcpu_state(vm, vcpu_id);
+
+       while (!READ_ONCE(host_quit)) {
+               ret = sem_wait(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+               if (READ_ONCE(host_quit))
+                       return NULL;
+
+               clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+               ret = _vcpu_run(vm, vcpu_id);
+               ts_diff = timespec_elapsed(start);
+
+               TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+               TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
+                           "Invalid guest sync status: exit_reason=%s\n",
+                           exit_reason_str(run->exit_reason));
+
+               pr_debug("Got sync event from vCPU %d\n", vcpu_id);
+               stage = READ_ONCE(*current_stage);
+
+               /*
+                * Report the per-vcpu execution time for the test
+                * stage that has just completed.
+                */
+               pr_debug("vCPU %d has completed stage %s\n"
+                        "execution time is: %ld.%.9lds\n\n",
+                        vcpu_id, test_stage_string[stage],
+                        ts_diff.tv_sec, ts_diff.tv_nsec);
+
+               ret = sem_post(&test_stage_completed);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+
+       return NULL;
+}
+
+struct test_params {
+       uint64_t phys_offset;
+       uint64_t test_mem_size;
+       enum vm_mem_backing_src_type src_type;
+};
+
+static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
+{
+       int ret;
+       struct test_params *p = arg;
+       struct vcpu_args *vcpu_args;
+       enum vm_mem_backing_src_type src_type = p->src_type;
+       uint64_t large_page_size = get_backing_src_pagesz(src_type);
+       uint64_t guest_page_size = vm_guest_mode_params[mode].page_size;
+       uint64_t host_page_size = getpagesize();
+       uint64_t test_mem_size = p->test_mem_size;
+       uint64_t guest_num_pages;
+       uint64_t alignment;
+       void *host_test_mem;
+       struct kvm_vm *vm;
+       int vcpu_id;
+
+       /* Align up the test memory size */
+       alignment = max(large_page_size, guest_page_size);
+       test_mem_size = (test_mem_size + alignment - 1) & ~(alignment - 1);
+
+       /* Create a VM with enough guest pages */
+       guest_num_pages = test_mem_size / guest_page_size;
+       vm = vm_create_with_vcpus(mode, nr_vcpus,
+                                 guest_num_pages, 0, guest_code, NULL);
+
+       /* Align down GPA of the testing memslot */
+       if (!p->phys_offset)
+               guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+                                      guest_page_size;
+       else
+               guest_test_phys_mem = p->phys_offset;
+#ifdef __s390x__
+       alignment = max(0x100000, alignment);
+#endif
+       guest_test_phys_mem &= ~(alignment - 1);
+
+       /* Set up the shared data structure test_args */
+       test_args.vm = vm;
+       test_args.guest_test_virt_mem = guest_test_virt_mem;
+       test_args.host_page_size = host_page_size;
+       test_args.host_num_pages = test_mem_size / host_page_size;
+       test_args.large_page_size = large_page_size;
+       test_args.large_num_pages = test_mem_size / large_page_size;
+       test_args.host_pages_per_lpage = large_page_size / host_page_size;
+       test_args.src_type = src_type;
+
+       for (vcpu_id = 0; vcpu_id < KVM_MAX_VCPUS; vcpu_id++) {
+               vcpu_args = &test_args.vcpu_args[vcpu_id];
+               vcpu_args->vcpu_id = vcpu_id;
+               vcpu_args->vcpu_write = !(vcpu_id % 2);
+       }
+
+       /* Add an extra memory slot with specified backing src type */
+       vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem,
+                                   TEST_MEM_SLOT_INDEX, guest_num_pages, 0);
+
+       /* Do mapping(GVA->GPA) for the testing memory slot */
+       virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+       /* Cache the HVA pointer of the region */
+       host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
+
+       /* Export shared structure test_args to guest */
+       ucall_init(vm, NULL);
+       sync_global_to_guest(vm, test_args);
+
+       ret = sem_init(&test_stage_updated, 0, 0);
+       TEST_ASSERT(ret == 0, "Error in sem_init");
+
+       ret = sem_init(&test_stage_completed, 0, 0);
+       TEST_ASSERT(ret == 0, "Error in sem_init");
+
+       current_stage = addr_gva2hva(vm, (vm_vaddr_t)(&guest_test_stage));
+       *current_stage = NUM_TEST_STAGES;
+
+       pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+       pr_info("Testing memory backing src type: %s\n",
+               vm_mem_backing_src_alias(src_type)->name);
+       pr_info("Testing memory backing src granularity: 0x%lx\n",
+               large_page_size);
+       pr_info("Testing memory size(aligned): 0x%lx\n", test_mem_size);
+       pr_info("Guest physical test memory offset: 0x%lx\n",
+               guest_test_phys_mem);
+       pr_info("Host  virtual  test memory offset: 0x%lx\n",
+               (uint64_t)host_test_mem);
+       pr_info("Number of testing vCPUs: %d\n", nr_vcpus);
+
+       return vm;
+}
+
+static void vcpus_complete_new_stage(enum test_stage stage)
+{
+       int ret;
+       int vcpus;
+
+       /* Wake up all the vcpus to run new test stage */
+       for (vcpus = 0; vcpus < nr_vcpus; vcpus++) {
+               ret = sem_post(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+       pr_debug("All vcpus have been notified to continue\n");
+
+       /* Wait for all the vcpus to complete new test stage */
+       for (vcpus = 0; vcpus < nr_vcpus; vcpus++) {
+               ret = sem_wait(&test_stage_completed);
+               TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+               pr_debug("%d vcpus have completed stage %s\n",
+                        vcpus + 1, test_stage_string[stage]);
+       }
+
+       pr_debug("All vcpus have completed stage %s\n",
+                test_stage_string[stage]);
+}
+
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+       int ret;
+       pthread_t *vcpu_threads;
+       struct kvm_vm *vm;
+       int vcpu_id;
+       struct timespec start;
+       struct timespec ts_diff;
+
+       /* Create the VM with vCPUs and do some pre-initialization */
+       vm = pre_init_before_test(mode, arg);
+
+       vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+       TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+       host_quit = false;
+       *current_stage = KVM_BEFORE_MAPPINGS;
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+                              &test_args.vcpu_args[vcpu_id]);
+       }
+
+       vcpus_complete_new_stage(*current_stage);
+       pr_info("Started all vCPUs successfully\n");
+
+       /* Test the stage of KVM creating mappings */
+       *current_stage = KVM_CREATE_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_CREATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Test the stage of KVM updating mappings */
+       vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
+                               KVM_MEM_LOG_DIRTY_PAGES);
+
+       *current_stage = KVM_UPDATE_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_UPDATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Test the stage of KVM adjusting mappings */
+       vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
+
+       *current_stage = KVM_ADJUST_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_ADJUST_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Tell the vcpu threads to quit */
+       host_quit = true;
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               ret = sem_post(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+               pthread_join(vcpu_threads[vcpu_id], NULL);
+
+       ret = sem_destroy(&test_stage_updated);
+       TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+       ret = sem_destroy(&test_stage_completed);
+       TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+       free(vcpu_threads);
+       ucall_uninit(vm);
+       kvm_vm_free(vm);
+}
+
+static void help(char *name)
+{
+       puts("");
+       printf("usage: %s [-h] [-p offset] [-m mode] "
+              "[-b mem-size] [-v vcpus] [-s mem-type]\n", name);
+       puts("");
+       printf(" -p: specify guest physical test memory offset\n"
+              "     Warning: a low offset can conflict with the loaded test code.\n");
+       guest_modes_help();
+       printf(" -b: specify the size of the memory region for testing, e.g. 10M or 3G.\n"
+              "     (default: 1G)\n");
+       printf(" -v: specify the number of vCPUs to run\n"
+              "     (default: 1)\n");
+       printf(" -s: specify the type of memory that should be used to\n"
+              "     back the guest data region.\n"
+              "     (default: anonymous)\n\n");
+       backing_src_help();
+       puts("");
+}
+
+int main(int argc, char *argv[])
+{
+       int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+       struct test_params p = {
+               .test_mem_size = DEFAULT_TEST_MEM_SIZE,
+               .src_type = VM_MEM_SRC_ANONYMOUS,
+       };
+       int opt;
+
+       guest_modes_append_default();
+
+       while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) {
+               switch (opt) {
+               case 'p':
+                       p.phys_offset = strtoull(optarg, NULL, 0);
+                       break;
+               case 'm':
+                       guest_modes_cmdline(optarg);
+                       break;
+               case 'b':
+                       p.test_mem_size = parse_size(optarg);
+                       break;
+               case 'v':
+                       nr_vcpus = atoi(optarg);
+                       TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+                                   "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+                       break;
+               case 's':
+                       p.src_type = parse_backing_src_type(optarg);
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       exit(0);
+               }
+       }
+
+       for_each_guest_mode(run_test, &p);
+
+       return 0;
+}
index 5ebbd0d..71ade61 100644 (file)
@@ -71,9 +71,9 @@ test_assert(bool exp, const char *exp_str,
 
                fprintf(stderr, "==== Test Assertion Failure ====\n"
                        "  %s:%u: %s\n"
-                       "  pid=%d tid=%d - %s\n",
+                       "  pid=%d tid=%d errno=%d - %s\n",
                        file, line, exp_str, getpid(), _gettid(),
-                       strerror(errno));
+                       errno, strerror(errno));
                test_dump_stack();
                if (fmt) {
                        fputs("  ", stderr);
index b8849a1..35247db 100644 (file)
@@ -18,7 +18,6 @@
 #include <unistd.h>
 #include <linux/kernel.h>
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN       2
 
 static int vcpu_mmap_sz(void);
@@ -143,17 +142,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
                "rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-       "PA-bits:52,  VA-bits:48,  4K pages",
-       "PA-bits:52,  VA-bits:48, 64K pages",
-       "PA-bits:48,  VA-bits:48,  4K pages",
-       "PA-bits:48,  VA-bits:48, 64K pages",
-       "PA-bits:40,  VA-bits:48,  4K pages",
-       "PA-bits:40,  VA-bits:48, 64K pages",
-       "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-              "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+       static const char * const strings[] = {
+               [VM_MODE_P52V48_4K]     = "PA-bits:52,  VA-bits:48,  4K pages",
+               [VM_MODE_P52V48_64K]    = "PA-bits:52,  VA-bits:48, 64K pages",
+               [VM_MODE_P48V48_4K]     = "PA-bits:48,  VA-bits:48,  4K pages",
+               [VM_MODE_P48V48_64K]    = "PA-bits:48,  VA-bits:48, 64K pages",
+               [VM_MODE_P40V48_4K]     = "PA-bits:40,  VA-bits:48,  4K pages",
+               [VM_MODE_P40V48_64K]    = "PA-bits:40,  VA-bits:48, 64K pages",
+               [VM_MODE_PXXV48_4K]     = "PA-bits:ANY, VA-bits:48,  4K pages",
+       };
+       _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+                      "Missing new mode strings?");
+
+       TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+       return strings[i];
+}
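The rewrite above swaps a bare string array for a bounds-checked lookup with designated initializers, still guarded by _Static_assert. A generic, self-contained sketch of the idiom with made-up names (RED/GREEN/BLUE are placeholders, not selftest code); appending a new enumerator without a matching string now trips the assert at compile time:

#include <stdio.h>

enum color { RED, GREEN, BLUE, NUM_COLORS };

static const char *color_string(enum color c)
{
        static const char * const strings[] = {
                [RED]   = "red",
                [GREEN] = "green",
                [BLUE]  = "blue",
        };
        _Static_assert(sizeof(strings) / sizeof(strings[0]) == NUM_COLORS,
                       "Missing new color strings?");

        return c < NUM_COLORS ? strings[c] : "unknown";
}

int main(void)
{
        printf("%s\n", color_string(GREEN));
        return 0;
}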
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
        { 52, 48,  0x1000, 12 },
@@ -681,7 +687,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
        int ret;
        struct userspace_mem_region *region;
-       size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+       size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
        size_t alignment;
 
        TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -743,7 +749,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
        if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-               alignment = max(huge_page_size, alignment);
+               alignment = max(backing_src_pagesz, alignment);
 
        /* Add enough memory to align up if necessary */
        if (alignment > 1)
@@ -752,7 +758,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region->mmap_start = mmap(NULL, region->mmap_size,
                                  PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS
-                                 | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+                                 | vm_mem_backing_src_alias(src_type)->flag,
                                  -1, 0);
        TEST_ASSERT(region->mmap_start != MAP_FAILED,
                    "test_malloc failed, mmap_start: %p errno: %i",
@@ -762,22 +768,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region->host_mem = align(region->mmap_start, alignment);
 
        /* As needed perform madvise */
-       if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
-               struct stat statbuf;
-
-               ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-               TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-                           "stat /sys/kernel/mm/transparent_hugepage");
-
-               TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-                           "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
-
-               if (ret == 0) {
-                       ret = madvise(region->host_mem, npages * vm->page_size,
-                                     src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-                       TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
-                                   region->host_mem, npages * vm->page_size, src_type);
-               }
+       if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+            src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+               ret = madvise(region->host_mem, npages * vm->page_size,
+                             src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+               TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
+                           region->host_mem, npages * vm->page_size,
+                           vm_mem_backing_src_alias(src_type)->name);
        }
 
        region->unused_phy_pages = sparsebit_alloc();
index 906c955..63d2bc7 100644 (file)
@@ -10,6 +10,8 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <time.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -111,28 +113,169 @@ void print_skip(const char *fmt, ...)
        puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-       {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-       {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-       {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
+bool thp_configured(void)
+{
+       int ret;
+       struct stat statbuf;
+
+       ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+       TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+                   "Error in stating /sys/kernel/mm/transparent_hugepage");
+
+       return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+       size_t size;
+       FILE *f;
+
+       TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+       f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+       TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");
+
+       fscanf(f, "%ld", &size);
+       fclose(f);
+
+       return size;
+}
+
+size_t get_def_hugetlb_pagesz(void)
+{
+       char buf[64];
+       const char *tag = "Hugepagesize:";
+       FILE *f;
+
+       f = fopen("/proc/meminfo", "r");
+       TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo");
+
+       while (fgets(buf, sizeof(buf), f) != NULL) {
+               if (strstr(buf, tag) == buf) {
+                       fclose(f);
+                       return strtoull(buf + strlen(tag), NULL, 10) << 10;
+               }
+       }
+
+       if (feof(f))
+               TEST_FAIL("HUGETLB is not configured in host kernel");
+       else
+               TEST_FAIL("Error in reading /proc/meminfo");
+
+       fclose(f);
+       return 0;
+}
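get_def_hugetlb_pagesz() above finds the "Hugepagesize:" line in /proc/meminfo and converts its kB value to bytes with a left shift by 10. A tiny illustration of that conversion on a typical meminfo line (a sketch of the same parsing, not the selftest helper itself):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        const char *line = "Hugepagesize:       2048 kB";
        const char *tag = "Hugepagesize:";

        /* strtoull() skips the leading whitespace; "<< 10" turns kB into bytes */
        unsigned long long bytes = strtoull(line + strlen(tag), NULL, 10) << 10;

        assert(bytes == 2ULL * 1024 * 1024);    /* 2048 kB == 2 MiB */
        return 0;
}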
+
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+       static const struct vm_mem_backing_src_alias aliases[] = {
+               [VM_MEM_SRC_ANONYMOUS] = {
+                       .name = "anonymous",
+                       .flag = 0,
+               },
+               [VM_MEM_SRC_ANONYMOUS_THP] = {
+                       .name = "anonymous_thp",
+                       .flag = 0,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
+                       .name = "anonymous_hugetlb",
+                       .flag = MAP_HUGETLB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
+                       .name = "anonymous_hugetlb_16kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
+                       .name = "anonymous_hugetlb_64kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_64KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
+                       .name = "anonymous_hugetlb_512kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_512KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
+                       .name = "anonymous_hugetlb_1mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_1MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
+                       .name = "anonymous_hugetlb_2mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_2MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
+                       .name = "anonymous_hugetlb_8mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_8MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
+                       .name = "anonymous_hugetlb_16mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
+                       .name = "anonymous_hugetlb_32mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_32MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
+                       .name = "anonymous_hugetlb_256mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_256MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
+                       .name = "anonymous_hugetlb_512mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_512MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
+                       .name = "anonymous_hugetlb_1gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_1GB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
+                       .name = "anonymous_hugetlb_2gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_2GB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
+                       .name = "anonymous_hugetlb_16gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16GB,
+               },
+       };
+       _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
+                      "Missing new backing src types?");
+
+       TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %d too big", i);
+
+       return &aliases[i];
+}
+
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
+size_t get_backing_src_pagesz(uint32_t i)
+{
+       uint32_t flag = vm_mem_backing_src_alias(i)->flag;
+
+       switch (i) {
+       case VM_MEM_SRC_ANONYMOUS:
+               return getpagesize();
+       case VM_MEM_SRC_ANONYMOUS_THP:
+               return get_trans_hugepagesz();
+       case VM_MEM_SRC_ANONYMOUS_HUGETLB:
+               return get_def_hugetlb_pagesz();
+       default:
+               return MAP_HUGE_PAGE_SIZE(flag);
+       }
+}
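The default case above decodes the page size straight from the mmap flag: the MAP_HUGE_* constants carry log2(page size) in the bits above MAP_HUGE_SHIFT, so shifting 1ULL by that field recovers the size. A small worked example with the encoding constants spelled out (values chosen to match asm-generic/hugetlb_encode.h; the local HUGE_* macro names are placeholders):

#include <assert.h>

#define HUGE_SHIFT      26                      /* MAP_HUGE_SHIFT */
#define HUGE_MASK       0x3fULL                 /* MAP_HUGE_MASK */
#define HUGE_2MB        (21U << HUGE_SHIFT)     /* MAP_HUGE_2MB: log2(2 MiB) == 21 */

#define HUGE_PAGE_SIZE(x) (1ULL << (((x) >> HUGE_SHIFT) & HUGE_MASK))

int main(void)
{
        /* 21 in the size field decodes to 1ULL << 21 == 2 MiB */
        assert(HUGE_PAGE_SIZE(HUGE_2MB) == 2ULL * 1024 * 1024);
        /* 30 decodes to 1ULL << 30 == 1 GiB, matching MAP_HUGE_1GB */
        assert(HUGE_PAGE_SIZE(30U << HUGE_SHIFT) == 1ULL << 30);
        return 0;
}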
 
 void backing_src_help(void)
 {
        int i;
 
        printf("Available backing src types:\n");
-       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
-               printf("\t%s\n", backing_src_aliases[i].name);
+       for (i = 0; i < NUM_SRC_TYPES; i++)
+               printf("\t%s\n", vm_mem_backing_src_alias(i)->name);
 }
 
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
 {
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
-               if (!strcmp(type_name, backing_src_aliases[i].name))
-                       return backing_src_aliases[i].type;
+       for (i = 0; i < NUM_SRC_TYPES; i++)
+               if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name))
+                       return i;
 
        backing_src_help();
        TEST_FAIL("Unknown backing src type: %s", type_name);
index 804ff5f..1f4a059 100644 (file)
@@ -186,7 +186,7 @@ int main(int argc, char *argv[])
                vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
        }
 
-       struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);;
+       struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
        rs->state = 0x5a;
 
        for (;;) {
index 592c1cc..0bd7342 100644 (file)
@@ -14,7 +14,7 @@
 #define __aligned(x) __attribute__((__aligned__(x)))
 #define __packed __attribute__((packed))
 
-#include "../../../../arch/x86/kernel/cpu/sgx/arch.h"
+#include "../../../../arch/x86/include/asm/sgx.h"
 #include "../../../../arch/x86/include/asm/enclu.h"
 #include "../../../../arch/x86/include/uapi/asm/sgx.h"
 
index 9d43b75..f441ac3 100644 (file)
@@ -45,19 +45,19 @@ static bool encl_map_bin(const char *path, struct encl *encl)
 
        fd = open(path, O_RDONLY);
        if (fd == -1)  {
-               perror("open()");
+               perror("enclave executable open()");
                return false;
        }
 
        ret = stat(path, &sb);
        if (ret) {
-               perror("stat()");
+               perror("enclave executable stat()");
                goto err;
        }
 
        bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (bin == MAP_FAILED) {
-               perror("mmap()");
+               perror("enclave executable mmap()");
                goto err;
        }
 
@@ -90,8 +90,7 @@ static bool encl_ioc_create(struct encl *encl)
        ioc.src = (unsigned long)secs;
        rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc);
        if (rc) {
-               fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n",
-                       errno);
+               perror("SGX_IOC_ENCLAVE_CREATE failed");
                munmap((void *)secs->base, encl->encl_size);
                return false;
        }
@@ -116,31 +115,72 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg)
 
        rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc);
        if (rc < 0) {
-               fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n",
-                       errno);
+               perror("SGX_IOC_ENCLAVE_ADD_PAGES failed");
                return false;
        }
 
        return true;
 }
 
+
+
 bool encl_load(const char *path, struct encl *encl)
 {
+       const char device_path[] = "/dev/sgx_enclave";
        Elf64_Phdr *phdr_tbl;
        off_t src_offset;
        Elf64_Ehdr *ehdr;
+       struct stat sb;
+       void *ptr;
        int i, j;
        int ret;
+       int fd = -1;
 
        memset(encl, 0, sizeof(*encl));
 
-       ret = open("/dev/sgx_enclave", O_RDWR);
-       if (ret < 0) {
-               fprintf(stderr, "Unable to open /dev/sgx_enclave\n");
+       fd = open(device_path, O_RDWR);
+       if (fd < 0) {
+               perror("Unable to open /dev/sgx_enclave");
+               goto err;
+       }
+
+       ret = stat(device_path, &sb);
+       if (ret) {
+               perror("device file stat()");
+               goto err;
+       }
+
+       /*
+        * This just checks if the /dev file has these permission
+        * bits set.  It does not check that the current user is
+        * the owner or in the owning group.
+        */
+       if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+               fprintf(stderr, "no execute permissions on device file %s\n", device_path);
+               goto err;
+       }
+
+       ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0);
+       if (ptr == (void *)-1) {
+               perror("mmap for read");
+               goto err;
+       }
+       munmap(ptr, PAGE_SIZE);
+
+#define ERR_MSG \
+"mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \
+" Check that current user has execute permissions on %s and \n" \
+" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \
+" If so, remount it executable: mount -o remount,exec /dev\n\n"
+
+       ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0);
+       if (ptr == (void *)-1) {
+               fprintf(stderr, ERR_MSG, device_path);
                goto err;
        }
+       munmap(ptr, PAGE_SIZE);
 
-       encl->fd = ret;
+       encl->fd = fd;
 
        if (!encl_map_bin(path, encl))
                goto err;
@@ -217,6 +257,8 @@ bool encl_load(const char *path, struct encl *encl)
        return true;
 
 err:
+       if (fd != -1)
+               close(fd);
        encl_delete(encl);
        return false;
 }
@@ -229,7 +271,7 @@ static bool encl_map_area(struct encl *encl)
        area = mmap(NULL, encl_size * 2, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (area == MAP_FAILED) {
-               perror("mmap");
+               perror("reservation mmap()");
                return false;
        }
 
@@ -268,8 +310,7 @@ bool encl_build(struct encl *encl)
        ioc.sigstruct = (uint64_t)&encl->sigstruct;
        ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc);
        if (ret) {
-               fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n",
-                       errno);
+               perror("SGX_IOC_ENCLAVE_INIT failed");
                return false;
        }
 
index 724cec7..d304a40 100644 (file)
@@ -15,6 +15,7 @@
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <sys/auxv.h>
 #include "defines.h"
 #include "main.h"
 #include "../kselftest.h"
@@ -28,24 +29,6 @@ struct vdso_symtab {
        Elf64_Word *elf_hashtab;
 };
 
-static void *vdso_get_base_addr(char *envp[])
-{
-       Elf64_auxv_t *auxv;
-       int i;
-
-       for (i = 0; envp[i]; i++)
-               ;
-
-       auxv = (Elf64_auxv_t *)&envp[i + 1];
-
-       for (i = 0; auxv[i].a_type != AT_NULL; i++) {
-               if (auxv[i].a_type == AT_SYSINFO_EHDR)
-                       return (void *)auxv[i].a_un.a_val;
-       }
-
-       return NULL;
-}
-
 static Elf64_Dyn *vdso_get_dyntab(void *addr)
 {
        Elf64_Ehdr *ehdr = addr;
@@ -162,7 +145,7 @@ static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r
        return 0;
 }
 
-int main(int argc, char *argv[], char *envp[])
+int main(int argc, char *argv[])
 {
        struct sgx_enclave_run run;
        struct vdso_symtab symtab;
@@ -195,7 +178,7 @@ int main(int argc, char *argv[], char *envp[])
                addr = mmap((void *)encl.encl_base + seg->offset, seg->size,
                            seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0);
                if (addr == MAP_FAILED) {
-                       fprintf(stderr, "mmap() failed, errno=%d.\n", errno);
+                       perror("mmap() segment failed");
                        exit(KSFT_FAIL);
                }
        }
@@ -203,7 +186,8 @@ int main(int argc, char *argv[], char *envp[])
        memset(&run, 0, sizeof(run));
        run.tcs = encl.encl_base;
 
-       addr = vdso_get_base_addr(envp);
+       /* Get vDSO base address */
+       addr = (void *)getauxval(AT_SYSINFO_EHDR);
        if (!addr)
                goto err;
 
index 62bd908..f08f5e8 100644 (file)
@@ -174,21 +174,36 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                                           struct kvm_coalesced_mmio_zone *zone)
 {
        struct kvm_coalesced_mmio_dev *dev, *tmp;
+       int r;
 
        if (zone->pio != 1 && zone->pio != 0)
                return -EINVAL;
 
        mutex_lock(&kvm->slots_lock);
 
-       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
                if (zone->pio == dev->zone.pio &&
                    coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
-                       kvm_io_bus_unregister_dev(kvm,
+                       r = kvm_io_bus_unregister_dev(kvm,
                                zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
                        kvm_iodevice_destructor(&dev->dev);
+
+                       /*
+                        * On failure, unregister destroys all devices on the
+                        * bus _except_ the target device, i.e. coalesced_zones
+                        * has been modified.  No need to restart the walk as
+                        * there aren't any zones left.
+                        */
+                       if (r)
+                               break;
                }
+       }
 
        mutex_unlock(&kvm->slots_lock);
 
+       /*
+        * Ignore the result of kvm_io_bus_unregister_dev(); from userspace's
+        * perspective, the coalesced MMIO is most definitely unregistered.
+        */
        return 0;
 }
index 383df23..2799c66 100644 (file)
@@ -451,35 +451,170 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
-                                       struct mm_struct *mm,
-                                       unsigned long address,
-                                       pte_t pte)
+typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
+                            unsigned long end);
+
+struct kvm_hva_range {
+       unsigned long start;
+       unsigned long end;
+       pte_t pte;
+       hva_handler_t handler;
+       on_lock_fn_t on_lock;
+       bool flush_on_ret;
+       bool may_block;
+};
+
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler.  The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int idx;
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
+
+static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
+                                                 const struct kvm_hva_range *range)
+{
+       bool ret = false, locked = false;
+       struct kvm_gfn_range gfn_range;
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       int i, idx;
+
+       /* A null handler is allowed if and only if on_lock() is provided. */
+       if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
+                        IS_KVM_NULL_FN(range->handler)))
+               return 0;
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       KVM_MMU_LOCK(kvm);
+       /* The on_lock() path does not yet support lock elision. */
+       if (!IS_KVM_NULL_FN(range->on_lock)) {
+               locked = true;
+               KVM_MMU_LOCK(kvm);
 
-       kvm->mmu_notifier_seq++;
+               range->on_lock(kvm, range->start, range->end);
+
+               if (IS_KVM_NULL_FN(range->handler))
+                       goto out_unlock;
+       }
 
-       if (kvm_set_spte_hva(kvm, address, pte))
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot(slot, slots) {
+                       unsigned long hva_start, hva_end;
+
+                       hva_start = max(range->start, slot->userspace_addr);
+                       hva_end = min(range->end, slot->userspace_addr +
+                                                 (slot->npages << PAGE_SHIFT));
+                       if (hva_start >= hva_end)
+                               continue;
+
+                       /*
+                        * To optimize for the likely case where the address
+                        * range is covered by zero or one memslots, don't
+                        * bother making these conditional (to avoid writes on
+                        * the second or later invocation of the handler).
+                        */
+                       gfn_range.pte = range->pte;
+                       gfn_range.may_block = range->may_block;
+
+                       /*
+                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+                        */
+                       gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+                       gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+                       gfn_range.slot = slot;
+
+                       if (!locked) {
+                               locked = true;
+                               KVM_MMU_LOCK(kvm);
+                       }
+                       ret |= range->handler(kvm, &gfn_range);
+               }
+       }
+
+       if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);
 
-       KVM_MMU_UNLOCK(kvm);
+out_unlock:
+       if (locked)
+               KVM_MMU_UNLOCK(kvm);
+
        srcu_read_unlock(&kvm->srcu, idx);
+
+       /* The notifiers are averse to booleans. :-( */
+       return (int)ret;
 }
 
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-                                       const struct mmu_notifier_range *range)
+static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
+                                               unsigned long start,
+                                               unsigned long end,
+                                               pte_t pte,
+                                               hva_handler_t handler)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int need_tlb_flush = 0, idx;
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = pte,
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = true,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+
+static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
+                                                        unsigned long start,
+                                                        unsigned long end,
+                                                        hva_handler_t handler)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = __pte(0),
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = false,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long address,
+                                       pte_t pte)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+       trace_kvm_set_spte_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
+       /*
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
+        * and so always runs with an elevated notifier count.  This obviates
+        * the need to bump the sequence count.
+        */
+       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+
+       kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+}
+
+static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
@@ -487,8 +622,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         */
        kvm->mmu_notifier_count++;
        if (likely(kvm->mmu_notifier_count == 1)) {
-               kvm->mmu_notifier_range_start = range->start;
-               kvm->mmu_notifier_range_end = range->end;
+               kvm->mmu_notifier_range_start = start;
+               kvm->mmu_notifier_range_end = end;
        } else {
                /*
                 * Fully tracking multiple concurrent ranges has diminishing
@@ -500,28 +635,36 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                 * complete.
                 */
                kvm->mmu_notifier_range_start =
-                       min(kvm->mmu_notifier_range_start, range->start);
+                       min(kvm->mmu_notifier_range_start, start);
                kvm->mmu_notifier_range_end =
-                       max(kvm->mmu_notifier_range_end, range->end);
+                       max(kvm->mmu_notifier_range_end, end);
        }
-       need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
-                                            range->flags);
-       /* we've to flush the tlb before the pages can be freed */
-       if (need_tlb_flush || kvm->tlbs_dirty)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return 0;
 }
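As the comment above explains, concurrent in-progress invalidations are tracked as one conservative span rather than a list of ranges: later ranges are folded in with min(start)/max(end). A compact stand-alone illustration of that folding (plain C, not kernel code):

#include <assert.h>

struct span { unsigned long start, end; };

/* Fold another in-flight invalidation into the single tracked span. */
static void track_range(struct span *s, unsigned long start, unsigned long end,
                        int already_active)
{
        if (!already_active) {
                s->start = start;
                s->end = end;
        } else {
                s->start = s->start < start ? s->start : start;
                s->end = s->end > end ? s->end : end;
        }
}

int main(void)
{
        struct span s;

        track_range(&s, 0x1000, 0x3000, 0);     /* first invalidation */
        track_range(&s, 0x8000, 0x9000, 1);     /* concurrent second one */

        /* Both invalidations are now covered by one conservative span. */
        assert(s.start == 0x1000 && s.end == 0x9000);
        return 0;
}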
 
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = kvm_unmap_gfn_range,
+               .on_lock        = kvm_inc_notifier_count,
+               .flush_on_ret   = true,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
 
-       KVM_MMU_LOCK(kvm);
+       trace_kvm_unmap_hva_range(range->start, range->end);
+
+       __kvm_handle_hva_range(kvm, &hva_range);
+
+       return 0;
+}
+
+static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
@@ -535,7 +678,23 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
-       KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                       const struct mmu_notifier_range *range)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = (void *)kvm_null_fn,
+               .on_lock        = kvm_dec_notifier_count,
+               .flush_on_ret   = false,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
+
+       __kvm_handle_hva_range(kvm, &hva_range);
 
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -545,20 +704,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              unsigned long start,
                                              unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-
-       young = kvm_age_hva(kvm, start, end);
-       if (young)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -566,11 +714,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
                                        unsigned long start,
                                        unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
        /*
         * Even though we do not flush TLB, this will still adversely
         * affect performance on pre-Haswell Intel EPT, where there is
@@ -584,27 +729,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
         * cadence. If we find this inaccurate, we might come up with a
         * more sophisticated heuristic later.
         */
-       young = kvm_age_hva(kvm, start, end);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_test_age_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-       young = kvm_test_age_hva(kvm, address);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, address, address + 1,
+                                            kvm_test_age_gfn);
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -3002,6 +3137,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
        return false;
 }
 
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
        struct kvm *kvm = me->kvm;
@@ -3035,7 +3175,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                            !vcpu_dy_runnable(vcpu))
                                continue;
                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-                               !kvm_arch_vcpu_in_kernel(vcpu))
+                           !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+                           !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
@@ -3182,7 +3323,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_decrement;
 
-       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
@@ -4062,6 +4203,12 @@ static struct file_operations kvm_vm_fops = {
        KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+bool file_is_kvm(struct file *file)
+{
+       return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
        int r;
@@ -4485,24 +4632,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
        return 0;
 }
 
-/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev)
 {
        int i, j;
        struct kvm_io_bus *new_bus, *bus;
 
+       lockdep_assert_held(&kvm->slots_lock);
+
        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
-               return;
+               return 0;
 
-       for (i = 0; i < bus->dev_count; i++)
+       for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev) {
                        break;
                }
+       }
 
        if (i == bus->dev_count)
-               return;
+               return 0;
 
        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
@@ -4511,7 +4660,13 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                                flex_array_size(new_bus, range, new_bus->dev_count - i));
-       } else {
+       }
+
+       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+       synchronize_srcu_expedited(&kvm->srcu);
+
+       /* Destroy the old bus _after_ installing the (null) bus. */
+       if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                for (j = 0; j < bus->dev_count; j++) {
                        if (j == i)
@@ -4520,10 +4675,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                }
        }
 
-       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-       synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
-       return;
+       return new_bus ? 0 : -ENOMEM;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,