From: Linus Torvalds
Date: Tue, 7 Sep 2021 20:40:51 +0000 (-0700)
Subject: Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
X-Git-Tag: microblaze-v5.16~66
X-Git-Url: http://git.monstr.eu/?p=linux-2.6-microblaze.git;a=commitdiff_plain;h=192ad3c27a4895ee4b2fa31c5b54a932f5bb08c1;hp=-c

Merge tag 'for-linus' of git://git./virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "ARM:

   - Page ownership tracking between host EL1 and EL2

   - Rely on userspace page tables to create large stage-2 mappings

   - Fix incompatibility between pKVM and kmemleak

   - Fix the PMU reset state, and improve the performance of the virtual
     PMU

   - Move over to the generic KVM entry code

   - Address PSCI reset issues w.r.t. save/restore

   - Preliminary rework for the upcoming pKVM fixed feature

   - A bunch of MM cleanups

   - a vGIC fix for timer spurious interrupts

   - Various cleanups

  s390:

   - enable interpretation of specification exceptions

   - fix a vcpu_idx vs vcpu_id mixup

  x86:

   - fast (lockless) page fault support for the new MMU

   - new MMU now the default

   - increased maximum allowed VCPU count

   - allow inhibit IRQs on KVM_RUN while debugging guests

   - let Hyper-V-enabled guests run with virtualized LAPIC as long as
     they do not enable the Hyper-V "AutoEOI" feature

   - fixes and optimizations for the toggling of AMD AVIC (virtualized
     LAPIC)

   - tuning for the case when two-dimensional paging (EPT/NPT) is
     disabled

   - bugfixes and cleanups, especially with respect to vCPU reset and
     choosing a paging mode based on CR0/CR4/EFER

   - support for 5-level page table on AMD processors

  Generic:

   - MMU notifier invalidation callbacks do not take mmu_lock unless
     necessary

   - improved caching of LRU kvm_memory_slot

   - support for histogram statistics

   - add statistics for halt polling and remote TLB flush requests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (210 commits)
  KVM: Drop unused kvm_dirty_gfn_invalid()
  KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
  KVM: MMU: mark role_regs and role accessors as maybe unused
  KVM: MIPS: Remove a "set but not used" variable
  x86/kvm: Don't enable IRQ when IRQ enabled in kvm_wait
  KVM: stats: Add VM stat for remote tlb flush requests
  KVM: Remove unnecessary export of kvm_{inc,dec}_notifier_count()
  KVM: x86/mmu: Move lpage_disallowed_link further "down" in kvm_mmu_page
  KVM: x86/mmu: Relocate kvm_mmu_page.tdp_mmu_page for better cache locality
  Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
  KVM: x86/mmu: Remove unused field mmio_cached in struct kvm_mmu_page
  kvm: x86: Increase KVM_SOFT_MAX_VCPUS to 710
  kvm: x86: Increase MAX_VCPUS to 1024
  kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS
  KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
  KVM: x86/mmu: Don't freak out if pml5_root is NULL on 4-level host
  KVM: s390: index kvm->arch.idle_mask by vcpu_idx
  KVM: s390: Enable specification exception interpretation
  KVM: arm64: Trim guest debug exception handling
  KVM: SVM: Add 5-level page table support for SVM
  ...
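
[Editor's note] The "support for histogram statistics" item above is documented in the
Documentation/virt/kvm/api.rst hunks further down, which define KVM_STATS_TYPE_LINEAR_HIST
and KVM_STATS_TYPE_LOG_HIST and their bucket ranges. The following is a minimal userspace
sketch, written only to illustrate those documented ranges; the helper names
linear_hist_bucket() and log_hist_bucket() are hypothetical and not part of the kernel or
its uapi.

	#include <stdint.h>

	/*
	 * Linear histogram (KVM_STATS_TYPE_LINEAR_HIST): bucket i covers
	 * [bucket_size * i, bucket_size * (i + 1)); the last bucket is
	 * open-ended, per the api.rst text below.
	 */
	static unsigned int linear_hist_bucket(uint64_t value, uint32_t bucket_size,
					       uint16_t size)
	{
		uint64_t index = value / bucket_size;

		return index < (unsigned int)(size - 1) ? index : size - 1;
	}

	/*
	 * Logarithmic histogram (KVM_STATS_TYPE_LOG_HIST): bucket 0 covers
	 * [0, 1), bucket i covers [2^(i-1), 2^i), and the last bucket is
	 * open-ended.
	 */
	static unsigned int log_hist_bucket(uint64_t value, uint16_t size)
	{
		unsigned int index = 0;

		/* index = floor(log2(value)) + 1 for value >= 1, else 0 */
		while (value) {
			index++;
			value >>= 1;
		}
		return index < (unsigned int)(size - 1) ? index : size - 1;
	}

For example, with size = 8 a sample of 5 lands in log bucket 3, i.e. the range [4, 8),
which matches the "[pow(2, N-2), pow(2, N-1))" wording in the documentation hunk.
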
--- 192ad3c27a4895ee4b2fa31c5b54a932f5bb08c1 diff --combined Documentation/virt/kvm/api.rst index c6212c2d5fe3,4ea1bb28297b..a6729c8cf063 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@@ -3357,6 -3357,7 +3357,7 @@@ flags which can include the following - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + - KVM_GUESTDBG_BLOCKIRQ: avoid injecting interrupts/NMI/SMI [x86] For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints are enabled in memory so we need to ensure breakpoint exceptions are @@@ -5077,7 -5078,7 +5078,7 @@@ of bytes successfully copied is returne then ``length`` is returned. 4.131 KVM_GET_SREGS2 ------------------- +-------------------- :Capability: KVM_CAP_SREGS2 :Architectures: x86 @@@ -5090,17 -5091,17 +5091,17 @@@ This ioctl (when supported) replaces th :: -struct kvm_sregs2 { - /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 flags; - __u64 pdptrs[4]; -}; + struct kvm_sregs2 { + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 flags; + __u64 pdptrs[4]; + }; flags values for ``kvm_sregs2``: @@@ -5110,7 -5111,7 +5111,7 @@@ 4.132 KVM_SET_SREGS2 ------------------- +-------------------- :Capability: KVM_CAP_SREGS2 :Architectures: x86 @@@ -5201,13 -5202,15 +5202,16 @@@ trailing ``'\0'``, is indicated by the The descriptors block is only needed to be read once for the lifetime of the file descriptor contains a sequence of ``struct kvm_stats_desc``, each followed by a string of size ``name_size``. +:: #define KVM_STATS_TYPE_SHIFT 0 #define KVM_STATS_TYPE_MASK (0xF << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@@ -5215,18 -5218,20 +5219,20 @@@ #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) + #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) #define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT) #define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT) + #define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2 struct kvm_stats_desc { __u32 flags; __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; @@@ -5235,26 -5240,38 +5241,40 @@@ by this descriptor. Its endianness is C The following flags are supported: Bits 0-3 of ``flags`` encode the type: + * ``KVM_STATS_TYPE_CUMULATIVE`` - The statistics data is cumulative. The value of data can only be increased. + The statistics reports a cumulative count. The value of data can only be increased. 
Most of the counters used in KVM are of this type. The corresponding ``size`` field for this type is always 1. All cumulative statistics data are read/write. * ``KVM_STATS_TYPE_INSTANT`` - The statistics data is instantaneous. Its value can be increased or + The statistics reports an instantaneous value. Its value can be increased or decreased. This type is usually used as a measurement of some resources, like the number of dirty pages, the number of large pages, etc. All instant statistics are read only. The corresponding ``size`` field for this type is always 1. * ``KVM_STATS_TYPE_PEAK`` - The statistics data is peak. The value of data can only be increased, and - represents a peak value for a measurement, for example the maximum number + The statistics data reports a peak value, for example the maximum number of items in a hash table bucket, the longest time waited and so on. + The value of data can only be increased. The corresponding ``size`` field for this type is always 1. + * ``KVM_STATS_TYPE_LINEAR_HIST`` + The statistic is reported as a linear histogram. The number of + buckets is specified by the ``size`` field. The size of buckets is specified + by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``) + is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last + bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity + value.) The bucket value indicates how many samples fell in the bucket's range. + * ``KVM_STATS_TYPE_LOG_HIST`` + The statistic is reported as a logarithmic histogram. The number of + buckets is specified by the ``size`` field. The range of the first bucket is + [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF). + Otherwise, The Nth bucket (1 < N < ``size``) covers + [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell + in the bucket's range. Bits 4-7 of ``flags`` encode the unit: + * ``KVM_STATS_UNIT_NONE`` There is no unit for the value of statistics data. This usually means that the value is a simple counter of an event. @@@ -5269,7 -5286,6 +5289,7 @@@ Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the unit: + * ``KVM_STATS_BASE_POW10`` The scale is based on power of 10. It is used for measurement of time and CPU clock cycles. For example, an exponent of -9 can be used with @@@ -5286,9 -5302,9 +5306,9 @@@ unsigned 64bit data The ``offset`` field is the offset from the start of Data Block to the start of the corresponding statistics data. - The ``unused`` field is reserved for future support for other types of - statistics data, like log/linear histogram. Its value is always 0 for the types - defined above. + The ``bucket_size`` field is used as a parameter for histogram statistics data. + It is only used by linear histogram statistics data, specifying the size of a + bucket. The ``name`` field is the name string of the statistics data. The name string starts at the end of ``struct kvm_stats_desc``. The maximum length including @@@ -7217,7 -7233,7 +7237,7 @@@ supported in the host. A VMM can check available to the guest on migration. 
8.33 KVM_CAP_HYPERV_ENFORCE_CPUID ------------------------------ +--------------------------------- Architectures: x86 diff --combined arch/arm64/include/asm/cpufeature.h index cdfa2a242e9f,20e0517eb669..ef6be92b1921 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@@ -552,7 -552,7 +552,7 @@@ cpuid_feature_cap_perfmon_field(u64 fea u64 mask = GENMASK_ULL(field + 3, field); /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ - if (val == 0xf) + if (val == ID_AA64DFR0_PMUVER_IMP_DEF) val = 0; if (val > cap) { @@@ -602,14 -602,14 +602,14 @@@ static inline bool id_aa64pfr0_32bit_el { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); - return val == ID_AA64PFR0_EL1_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); - return val == ID_AA64PFR0_EL0_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_sve(u64 pfr0) @@@ -784,13 -784,13 +784,13 @@@ extern int do_emulate_mrs(struct pt_reg static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: return 48; - case 6: return 52; + case ID_AA64MMFR0_PARANGE_32: return 32; + case ID_AA64MMFR0_PARANGE_36: return 36; + case ID_AA64MMFR0_PARANGE_40: return 40; + case ID_AA64MMFR0_PARANGE_42: return 42; + case ID_AA64MMFR0_PARANGE_44: return 44; + case ID_AA64MMFR0_PARANGE_48: return 48; + case ID_AA64MMFR0_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme diff --combined arch/arm64/include/asm/sysreg.h index f2e06e7c0a31,7640fa27be94..b268082d67ed --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@@ -11,7 -11,6 +11,7 @@@ #include #include +#include /* * ARMv8 ARM reserves the following encoding for system registers: @@@ -699,7 -698,8 +699,7 @@@ (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_EL1_SA0 | \ SCTLR_EL1_SED | SCTLR_ELx_I | SCTLR_EL1_DZE | SCTLR_EL1_UCT | \ SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN | SCTLR_ELx_ITFSB | \ - SCTLR_ELx_ATA | SCTLR_EL1_ATA0 | ENDIAN_SET_EL1 | SCTLR_EL1_UCI | \ - SCTLR_EL1_EPAN | SCTLR_EL1_RES1) + ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_EPAN | SCTLR_EL1_RES1) /* MAIR_ELx memory attributes (used by Linux) */ #define MAIR_ATTR_DEVICE_nGnRnE UL(0x00) @@@ -784,14 -784,13 +784,13 @@@ #define ID_AA64PFR0_AMU 0x1 #define ID_AA64PFR0_SVE 0x1 #define ID_AA64PFR0_RAS_V1 0x1 + #define ID_AA64PFR0_RAS_V1P1 0x2 #define ID_AA64PFR0_FP_NI 0xf #define ID_AA64PFR0_FP_SUPPORTED 0x0 #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 - #define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 - #define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 - #define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 - #define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 + #define ID_AA64PFR0_ELx_64BIT_ONLY 0x1 + #define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ #define ID_AA64PFR1_MPAMFRAC_SHIFT 16 @@@ -847,6 -846,9 +846,9 @@@ #define ID_AA64MMFR0_ASID_SHIFT 4 #define ID_AA64MMFR0_PARANGE_SHIFT 0 + #define ID_AA64MMFR0_ASID_8 0x0 + #define ID_AA64MMFR0_ASID_16 0x2 + #define ID_AA64MMFR0_TGRAN4_NI 0xf #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX 0x7 @@@ -857,9 -859,16 +859,16 @@@ #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN 0x1 
#define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX 0xf + #define ID_AA64MMFR0_PARANGE_32 0x0 + #define ID_AA64MMFR0_PARANGE_36 0x1 + #define ID_AA64MMFR0_PARANGE_40 0x2 + #define ID_AA64MMFR0_PARANGE_42 0x3 + #define ID_AA64MMFR0_PARANGE_44 0x4 #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 + #define ARM64_MIN_PARANGE_BITS 32 + #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 @@@ -904,6 -913,7 +913,7 @@@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ + #define ID_AA64DFR0_MTPMU_SHIFT 48 #define ID_AA64DFR0_TRBE_SHIFT 44 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 @@@ -1034,14 -1044,17 +1044,17 @@@ #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX + #define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX + #define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT #elif defined(CONFIG_ARM64_64K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX + #define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT #endif #define MVFR2_FPMISC_SHIFT 4 @@@ -1071,21 -1084,6 +1084,21 @@@ #define SYS_GCR_EL1_RRND (BIT(16)) #define SYS_GCR_EL1_EXCL_MASK 0xffffUL +#ifdef CONFIG_KASAN_HW_TAGS +/* + * KASAN always uses a whole byte for its tags. With CONFIG_KASAN_HW_TAGS it + * only uses tags in the range 0xF0-0xFF, which we map to MTE tags 0x0-0xF. + */ +#define __MTE_TAG_MIN (KASAN_TAG_MIN & 0xf) +#define __MTE_TAG_MAX (KASAN_TAG_MAX & 0xf) +#define __MTE_TAG_INCL GENMASK(__MTE_TAG_MAX, __MTE_TAG_MIN) +#define KERNEL_GCR_EL1_EXCL (SYS_GCR_EL1_EXCL_MASK & ~__MTE_TAG_INCL) +#else +#define KERNEL_GCR_EL1_EXCL SYS_GCR_EL1_EXCL_MASK +#endif + +#define KERNEL_GCR_EL1 (SYS_GCR_EL1_RRND | KERNEL_GCR_EL1_EXCL) + /* RGSR_EL1 Definitions */ #define SYS_RGSR_EL1_TAG_MASK 0xfUL #define SYS_RGSR_EL1_SEED_SHIFT 8 @@@ -1172,6 -1170,11 +1185,11 @@@ #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) + #define ARM64_FEATURE_FIELD_BITS 4 + + /* Create a mask for the feature bits of the specified feature. 
*/ + #define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT)) + #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 diff --combined arch/arm64/kernel/cpufeature.c index b2770d753ba3,5b59fe5e26e4..f8a3067d10c6 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@@ -67,7 -67,6 +67,7 @@@ #include #include #include +#include #include #include #include @@@ -240,8 -239,8 +240,8 @@@ static const struct arm64_ftr_bits ftr_ S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), ARM64_FTR_END, }; @@@ -1322,31 -1321,6 +1322,31 @@@ const struct cpumask *system_32bit_el0_ return cpu_possible_mask; } +static int __init parse_32bit_el0_param(char *str) +{ + allow_mismatched_32bit_el0 = true; + return 0; +} +early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param); + +static ssize_t aarch32_el0_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct cpumask *mask = system_32bit_el0_cpumask(); + + return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(mask)); +} +static const DEVICE_ATTR_RO(aarch32_el0); + +static int __init aarch32_el0_sysfs_init(void) +{ + if (!allow_mismatched_32bit_el0) + return 0; + + return device_create_file(cpu_subsys.dev_root, &dev_attr_aarch32_el0); +} +device_initcall(aarch32_el0_sysfs_init); + static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope) { if (!has_cpuid_feature(entry, scope)) @@@ -1587,6 -1561,8 +1587,6 @@@ kpti_install_ng_mappings(const struct a if (!cpu) arm64_use_ng_mappings = true; - - return; } #else static void @@@ -1758,7 -1734,7 +1758,7 @@@ static void cpu_has_fwb(const struct ar u64 val = read_sysreg_s(SYS_CLIDR_EL1); /* Check that CLIDR_EL1.LOU{U,IS} are both 0 */ - WARN_ON(val & (7 << 27 | 7 << 21)); + WARN_ON(CLIDR_LOUU(val) || CLIDR_LOUIS(val)); } #ifdef CONFIG_ARM64_PAN @@@ -1867,9 -1843,6 +1867,9 @@@ static void bti_enable(const struct arm #ifdef CONFIG_ARM64_MTE static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) { + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); + isb(); + /* * Clear the tags in the zero page. This needs to be done via the * linear map which has the Tagged attribute. 
@@@ -1983,7 -1956,7 +1983,7 @@@ static const struct arm64_cpu_capabilit .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, #ifdef CONFIG_KVM { @@@ -1994,7 -1967,7 +1994,7 @@@ .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL1_SHIFT, - .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, { .desc = "Protected KVM", @@@ -2928,38 -2901,15 +2928,38 @@@ void __init setup_cpu_features(void static int enable_mismatched_32bit_el0(unsigned int cpu) { + /* + * The first 32-bit-capable CPU we detected and so can no longer + * be offlined by userspace. -1 indicates we haven't yet onlined + * a 32-bit-capable CPU. + */ + static int lucky_winner = -1; + struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); if (cpu_32bit) { cpumask_set_cpu(cpu, cpu_32bit_el0_mask); static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0); - setup_elf_hwcaps(compat_elf_hwcaps); } + if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit) + return 0; + + if (lucky_winner >= 0) + return 0; + + /* + * We've detected a mismatch. We need to keep one of our CPUs with + * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting + * every CPU in the system for a 32-bit task. + */ + lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask, + cpu_active_mask); + get_cpu_device(lucky_winner)->offline_disabled = true; + setup_elf_hwcaps(compat_elf_hwcaps); + pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n", + cpu, lucky_winner); return 0; } diff --combined arch/powerpc/include/asm/kvm_host.h index a779f7849cfb,4931d03e5799..080a7feb7731 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@@ -103,7 -103,6 +103,6 @@@ struct kvm_vcpu_stat u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; - u64 halt_wait_ns; u64 halt_successful_wait; u64 dbell_exits; u64 gdbell_exits; @@@ -811,8 -810,6 +810,8 @@@ struct kvm_vcpu_arch u32 online; + u64 hfscr_permitted; /* A mask of permitted HFSCR facilities */ + /* For support of nested guests */ struct kvm_nested_guest *nested; u32 nested_vcpu_id; diff --combined arch/powerpc/kvm/book3s_64_vio_hv.c index 636c6ae0939b,f38dfe195ef2..870b7f0c7ea5 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@@ -80,7 -80,7 +80,7 @@@ static long kvmppc_rm_tce_to_ua(struct unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return -EINVAL; @@@ -173,13 -173,10 +173,13 @@@ static void kvmppc_rm_tce_put(struct kv idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; /* - * page must not be NULL in real mode, - * kvmppc_rm_ioba_validate() must have taken care of this. + * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is + * being cleared, otherwise it returns H_TOO_HARD and we skip this. 
*/ - WARN_ON_ONCE_RM(!page); + if (!page) { + WARN_ON_ONCE_RM(tce != 0); + return; + } tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; diff --combined arch/powerpc/kvm/book3s_hv.c index bb0dacf7cbec,829ff9bb5250..2acb1c96cfaf --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@@ -59,7 -59,6 +59,7 @@@ #include #include #include +#include #include #include #include @@@ -1166,7 -1165,7 +1166,7 @@@ int kvmppc_pseries_do_hcall(struct kvm_ break; #endif case H_RANDOM: - if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; break; case H_RPT_INVALIDATE: @@@ -1680,21 -1679,6 +1680,21 @@@ static int kvmppc_handle_exit_hv(struc r = RESUME_GUEST; } break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ +#endif + /* * This occurs if the guest (kernel or userspace), does something that * is prohibited by HFSCR. @@@ -1713,6 -1697,18 +1713,6 @@@ } break; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - case BOOK3S_INTERRUPT_HV_SOFTPATCH: - /* - * This occurs for various TM-related instructions that - * we need to emulate on POWER9 DD2.2. We have already - * handled the cases where the guest was in real-suspend - * mode and was transitioning to transactional state. - */ - r = kvmhv_p9_tm_emulation(vcpu); - break; -#endif - case BOOK3S_INTERRUPT_HV_RM_HARD: r = RESUME_PASSTHROUGH; break; @@@ -1731,7 -1727,6 +1731,7 @@@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; int r; int srcu_idx; @@@ -1816,41 -1811,9 +1816,41 @@@ * mode and was transitioning to transactional state. */ r = kvmhv_p9_tm_emulation(vcpu); - break; + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ #endif + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: { + u64 cause = vcpu->arch.hfscr >> 56; + + /* + * Only pass HFU interrupts to the L1 if the facility is + * permitted but disabled by the L1's HFSCR, otherwise + * the interrupt does not make sense to the L1 so turn + * it into a HEAI. + */ + if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || + (nested->hfscr & (1UL << cause))) { + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + + /* + * If the fetch failed, return to guest and + * try executing it again. 
+ */ + r = kvmppc_get_last_inst(vcpu, INST_GENERIC, + &vcpu->arch.emul_inst); + if (r != EMULATE_DONE) + r = RESUME_GUEST; + else + r = RESUME_HOST; + } else { + r = RESUME_HOST; + } + + break; + } + case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; @@@ -2721,7 -2684,6 +2721,7 @@@ static int kvmppc_core_vcpu_create_hv(s spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.shregs.msr = MSR_ME; vcpu->arch.intr_msr = MSR_SF | MSR_ME; /* @@@ -2743,8 -2705,6 +2743,8 @@@ if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; + vcpu->arch.hfscr_permitted = vcpu->arch.hfscr; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; @@@ -3767,6 -3727,7 +3767,6 @@@ static void load_spr_state(struct kvm_v mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); mtspr(SPRN_BESCR, vcpu->arch.bescr); - mtspr(SPRN_WORT, vcpu->arch.wort); mtspr(SPRN_TIDR, vcpu->arch.tid); mtspr(SPRN_AMR, vcpu->arch.amr); mtspr(SPRN_UAMOR, vcpu->arch.uamor); @@@ -3793,6 -3754,7 +3793,6 @@@ static void store_spr_state(struct kvm_ vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); vcpu->arch.bescr = mfspr(SPRN_BESCR); - vcpu->arch.wort = mfspr(SPRN_WORT); vcpu->arch.tid = mfspr(SPRN_TIDR); vcpu->arch.amr = mfspr(SPRN_AMR); vcpu->arch.uamor = mfspr(SPRN_UAMOR); @@@ -3824,6 -3786,7 +3824,6 @@@ static void restore_p9_host_os_sprs(str struct p9_host_os_sprs *host_os_sprs) { mtspr(SPRN_PSPB, 0); - mtspr(SPRN_WORT, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_os_sprs->dscr); @@@ -3889,18 -3852,6 +3889,18 @@@ static int kvmhv_p9_guest_entry(struct cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; + } else { + get_lppaca()->pmcregs_in_use = 1; + } + barrier(); + } +#endif kvmhv_load_guest_pmu(vcpu); msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); @@@ -4035,13 -3986,6 +4035,13 @@@ save_pmu |= nesting_enabled(vcpu->kvm); kvmhv_save_guest_pmu(vcpu, save_pmu); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); + barrier(); + } +#endif vc->entry_exit_map = 0x101; vc->in_guest = 0; @@@ -4202,19 -4146,31 +4202,31 @@@ out /* Attribute wait time */ if (do_sleep) { - vc->runner->stat.halt_wait_ns += + vc->runner->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(start_wait); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(start_wait)); /* Attribute failed poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_fail_ns += ktime_to_ns(start_wait) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_fail_hist, + ktime_to_ns(start_wait) - + ktime_to_ns(start_poll)); + } } else { /* Attribute successful poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_success_ns += ktime_to_ns(cur) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_success_hist, + ktime_to_ns(cur) - ktime_to_ns(start_poll)); + } } /* Adjust poll time */ @@@ -5384,7 -5340,6 +5396,7 @@@ static int kvmppc_set_passthru_irq(stru struct 
kvmppc_passthru_irqmap *pimap; struct irq_chip *chip; int i, rc = 0; + struct irq_data *host_data; if (!kvm_irq_bypass) return 1; @@@ -5412,7 -5367,7 +5424,7 @@@ * what our real-mode EOI code does, or a XIVE interrupt */ chip = irq_data_get_irq_chip(&desc->irq_data); - if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { + if (!chip || !is_pnv_opal_msi(chip)) { pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", host_irq, guest_gsi); mutex_unlock(&kvm->lock); @@@ -5449,22 -5404,15 +5461,22 @@@ * the KVM real mode handler. */ smp_wmb(); - irq_map->r_hwirq = desc->irq_data.hwirq; + + /* + * The 'host_irq' number is mapped in the PCI-MSI domain but + * the underlying calls, which will EOI the interrupt in real + * mode, need an HW IRQ number mapped in the XICS IRQ domain. + */ + host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq); + irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data); if (i == pimap->n_mapped) pimap->n_mapped++; if (xics_on_xive()) - rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq); else - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq); if (rc) irq_map->r_hwirq = 0; @@@ -5503,7 -5451,7 +5515,7 @@@ static int kvmppc_clr_passthru_irq(stru } if (xics_on_xive()) - rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); diff --combined arch/s390/include/asm/kvm_host.h index d681ae462350,bf1ab0630ec1..a604d51acfc8 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@@ -244,6 -244,7 +244,7 @@@ struct kvm_s390_sie_block __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 + #define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ @@@ -798,12 -799,14 +799,12 @@@ struct kvm_s390_cpu_model unsigned short ibc; }; -struct kvm_s390_module_hook { - int (*hook)(struct kvm_vcpu *vcpu); - struct module *owner; -}; +typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); struct kvm_s390_crypto { struct kvm_s390_crypto_cb *crycb; - struct kvm_s390_module_hook *pqap_hook; + struct rw_semaphore pqap_hook_rwsem; + crypto_hook *pqap_hook; __u32 crycbd; __u8 aes_kw; __u8 dea_kw; @@@ -955,6 -958,7 +956,7 @@@ struct kvm_arch atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; diff --combined arch/s390/kvm/kvm-s390.c index efda0615741f,1053c14c78ea..752a0ffab9bf --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@@ -66,8 -66,6 +66,6 @@@ const struct _kvm_stats_desc kvm_vm_sta STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio) }; - static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@@ -174,8 -172,6 +172,6 @@@ const struct _kvm_stats_desc kvm_vcpu_s STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), STATS_DESC_COUNTER(VCPU, pfault_sync) }; - static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = 
KVM_STATS_NAME_SIZE, @@@ -1953,7 -1949,7 +1949,7 @@@ out static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (gfn >= memslots[slot].base_gfn && @@@ -1974,7 -1970,7 +1970,7 @@@ if (gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); } return start; @@@ -2559,26 -2555,12 +2555,26 @@@ static void kvm_s390_set_crycb_format(s kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } +/* + * kvm_arch_crypto_set_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be set. + * @apm: the mask identifying the accessible AP adapters + * @aqm: the mask identifying the accessible AP domains + * @adm: the mask identifying the accessible AP control domains + * + * Set the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm) { struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { @@@ -2609,23 -2591,13 +2605,23 @@@ /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); +/* + * kvm_arch_crypto_clear_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be cleared. + * + * Clear the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. 
+ */ void kvm_arch_crypto_clear_masks(struct kvm *kvm) { - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); memset(&kvm->arch.crypto.crycb->apcb0, 0, @@@ -2637,6 -2609,7 +2633,6 @@@ /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); @@@ -2653,7 -2626,6 +2649,7 @@@ static void kvm_s390_crypto_init(struc { kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; kvm_s390_set_crycb_format(kvm); + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); if (!test_kvm_facility(kvm, 76)) return; @@@ -3224,6 -3196,8 +3220,8 @@@ static int kvm_s390_vcpu_setup(struct k vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@@ -4068,7 -4042,7 +4066,7 @@@ static int vcpu_pre_run(struct kvm_vcp kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --combined arch/x86/kvm/hyperv.c index 41d2a53c5dea,fe4a02715266..232a86a6faaf --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@@ -88,6 -88,10 +88,10 @@@ static bool synic_has_vector_auto_eoi(s static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + int auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@@ -96,10 -100,30 +100,30 @@@ else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + + if (!!auto_eoi_old == !!auto_eoi_new) + return; + + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + __kvm_request_apicv_update(vcpu->kvm, + !hv->synic_auto_eoi_used, + APICV_INHIBIT_REASON_HYPERV); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@@ -933,12 -957,6 +957,6 @@@ int kvm_hv_activate_synic(struct kvm_vc synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so request - * to deactivate APICV permanently. 
- */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; @@@ -1933,7 -1951,7 +1951,7 @@@ ret_success void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { @@@ -2476,6 -2494,8 +2494,8 @@@ int kvm_get_hv_cpuid(struct kvm_vcpu *v ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. diff --combined arch/x86/kvm/svm/nested.c index e5515477c30a,5e13357da21e..2545d0c61985 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@@ -158,9 -158,6 +158,9 @@@ void recalc_intercepts(struct vcpu_svm /* If SMI is not intercepted, ignore guest SMI intercept as well */ if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); + + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); } static void copy_vmcb_control_area(struct vmcb_control_area *dst, @@@ -506,11 -503,7 +506,11 @@@ static void nested_vmcb02_prepare_save( static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { - const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + const u32 int_ctl_vmcb01_bits = + V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; + + const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + struct kvm_vcpu *vcpu = &svm->vcpu; /* @@@ -542,8 -535,8 +542,8 @@@ vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; svm->vmcb->control.int_ctl = - (svm->nested.ctl.int_ctl & ~mask) | - (svm->vmcb01.ptr->control.int_ctl & mask); + (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | + (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; @@@ -666,11 -659,6 +666,6 @@@ int nested_svm_vmrun(struct kvm_vcpu *v goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. diff --combined arch/x86/kvm/svm/svm.c index 69639f9624f5,1a70e11f0487..05e8d4d27969 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@@ -46,8 -46,6 +46,6 @@@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" - #define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@@ -261,7 -259,7 +259,7 @@@ u32 svm_msrpm_offset(u32 msr static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 - return PT64_ROOT_4LEVEL; + return pgtable_l5_enabled() ? 
PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@@ -462,11 -460,6 +460,6 @@@ static int has_svm(void return 0; } - if (pgtable_l5_enabled()) { - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); - return 0; - } - return 1; } @@@ -1015,7 -1008,9 +1008,9 @@@ static __init int svm_hardware_setup(vo if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ @@@ -1161,8 -1156,6 +1156,6 @@@ static void init_vmcb(struct kvm_vcpu * struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@@ -1241,29 -1234,14 +1234,14 @@@ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? 
*/ - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@@ -1273,14 -1251,12 +1251,12 @@@ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; @@@ -1330,25 -1306,11 +1306,11 @@@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); - - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); - kvm_rdx_write(vcpu, eax); - - if (kvm_vcpu_apicv_active(vcpu) && !init_event) - avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@@ -1513,12 -1475,15 +1475,15 @@@ static void svm_vcpu_load(struct kvm_vc sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); } - avic_vcpu_load(vcpu, cpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; @@@ -1560,7 -1525,7 +1525,7 @@@ static void svm_cache_reg(struct kvm_vc load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } @@@ -1589,18 -1554,17 +1554,18 @@@ static void svm_set_vintr(struct vcpu_s static void svm_clear_vintr(struct vcpu_svm *svm) { - const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. */ - svm->vmcb->control.int_ctl &= mask; + svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; if (is_guest_mode(&svm->vcpu)) { - svm->vmcb01.ptr->control.int_ctl &= mask; + svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); - svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; + + svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & + V_IRQ_INJECTION_BITS_MASK; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); @@@ -2078,11 -2042,15 +2043,15 @@@ static int shutdown_interception(struc return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. 
*/ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@@ -2993,10 -2961,6 +2962,6 @@@ static int svm_set_msr(struct kvm_vcpu svm->msr_decfg = data; break; } - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); - fallthrough; default: return kvm_set_msr_common(vcpu, msr); } @@@ -3021,7 -2985,7 +2986,7 @@@ static int interrupt_window_interceptio * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(vcpu, true); + kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@@ -3269,12 -3233,14 +3234,14 @@@ static void dump_vmcb(struct kvm_vcpu * "excp_to:", save->last_excp_to); } - static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) + static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]) - return 0; + return (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]); + } + static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) + { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@@ -3282,14 -3248,13 +3249,13 @@@ vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - - return -EINVAL; + return 0; } int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(vcpu, exit_code)) - return 0; + if (!svm_check_exit_valid(vcpu, exit_code)) + return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) @@@ -3573,7 -3538,7 +3539,7 @@@ static void svm_enable_irq_window(struc * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. 
*/ - svm_toggle_avic_for_irq_window(vcpu, false); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@@ -3808,6 -3773,8 +3774,8 @@@ static __no_kcsan fastpath_t svm_vcpu_r pre_svm_run(vcpu); + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@@ -4610,7 -4577,6 +4578,6 @@@ static struct kvm_x86_ops svm_x86_ops _ .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --combined arch/x86/kvm/vmx/nested.c index b3f77d18eb5a,bc6327950657..ccb03d69546c --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@@ -330,31 -330,6 +330,31 @@@ void nested_vmx_free_vcpu(struct kvm_vc vcpu_put(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, + gpa_t addr) +{ + uint i; + struct kvm_mmu_root_info *cached_root; + + WARN_ON_ONCE(!mmu_is_nested(vcpu)); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + cached_root = &vcpu->arch.mmu->prev_roots[i]; + + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, + eptp)) + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + } +} + static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@@ -367,22 -342,10 +367,22 @@@ vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } else { + if (fault->error_code & PFERR_RSVD_MASK) + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + + /* + * Although the caller (kvm_inject_emulated_page_fault) would + * have already synced the faulting address in the shadow EPT + * tables for the current EPTP12, we also need to sync it for + * any other cached EPTP02s based on the same EP4TA, since the + * TLB associates mappings to the EP4TA rather than the full EPTP. + */ + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, + fault->address); + } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; @@@ -2207,7 -2170,8 +2207,8 @@@ static void prepare_vmcs02_early_rare(s } } - static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) + static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); @@@ -2218,23 -2182,22 +2219,22 @@@ /* * PIN CONTROLS */ - exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. 
*/ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; @@@ -2271,10 -2234,11 +2271,11 @@@ * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; + exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | @@@ -2282,7 -2246,9 +2283,9 @@@ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING); + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_DESC); + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@@ -2322,8 -2288,9 +2325,9 @@@ * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= vmcs12->vm_entry_controls; + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; @@@ -2339,9 -2306,11 +2343,11 @@@ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
*/ - exec_control = vmx_vmexit_ctrl(); + exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* @@@ -3384,7 -3353,7 +3390,7 @@@ enum nvmx_vmentry_status nested_vmx_ent vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - prepare_vmcs02_early(vmx, vmcs12); + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { @@@ -4304,7 -4273,7 +4310,7 @@@ static void load_vmcs12_host_state(stru seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@@ -4315,17 -4284,17 +4321,17 @@@ .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@@ -4333,14 -4302,15 +4339,15 @@@ .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@@ -4419,9 -4389,6 +4426,6 @@@ static void nested_vmx_restore_host_sta kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation @@@ -5362,6 -5329,14 +5366,6 @@@ static int handle_vmptrst(struct kvm_vc return nested_vmx_succeed(vcpu); } -#define EPTP_PA_MASK GENMASK_ULL(51, 12) - -static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) -{ - return VALID_PAGE(root_hpa) && - ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); -} - /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { @@@ -5855,8 -5830,7 +5859,8 @@@ static bool nested_vmx_l0_wants_exit(st if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) - return vcpu->arch.apf.host_apf_flags || !enable_ept; + return vcpu->arch.apf.host_apf_flags || + vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
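
[Editor's note] The api.rst hunk near the top of this diff also documents the new
KVM_GUESTDBG_BLOCKIRQ flag ("avoid injecting interrupts/NMI/SMI [x86]"), corresponding to
the "allow inhibit IRQs on KVM_RUN while debugging guests" item in the pull request. A
minimal userspace sketch of how a VMM might use it is below; it assumes an already
created vCPU file descriptor (vcpu_fd) and only uses the existing KVM_SET_GUEST_DEBUG
ioctl and KVM_GUESTDBG_* uapi flags. Supported flags can be queried beforehand via the
KVM_CAP_SET_GUEST_DEBUG2 capability.

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/*
	 * Single-step the guest without letting interrupts/NMI/SMI be
	 * injected between steps (illustrative helper, not from this patch).
	 */
	static int block_irqs_while_debugging(int vcpu_fd)
	{
		struct kvm_guest_debug dbg = {
			.control = KVM_GUESTDBG_ENABLE |
				   KVM_GUESTDBG_SINGLESTEP |
				   KVM_GUESTDBG_BLOCKIRQ,
		};

		return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
	}
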