Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Sep 2021 20:40:51 +0000 (13:40 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Sep 2021 20:40:51 +0000 (13:40 -0700)
Pull KVM updates from Paolo Bonzini:
 "ARM:
   - Page ownership tracking between host EL1 and EL2
   - Rely on userspace page tables to create large stage-2 mappings
   - Fix incompatibility between pKVM and kmemleak
   - Fix the PMU reset state, and improve the performance of the virtual
     PMU
   - Move over to the generic KVM entry code
   - Address PSCI reset issues w.r.t. save/restore
   - Preliminary rework for the upcoming pKVM fixed feature
   - A bunch of MM cleanups
   - A vGIC fix for spurious timer interrupts
   - Various cleanups

  s390:
   - enable interpretation of specification exceptions
   - fix a vcpu_idx vs vcpu_id mixup

  x86:
   - fast (lockless) page fault support for the new MMU
   - new MMU now the default
   - increased maximum allowed VCPU count
   - allow inhibiting IRQs on KVM_RUN while debugging guests
   - let Hyper-V-enabled guests run with virtualized LAPIC as long as
     they do not enable the Hyper-V "AutoEOI" feature
   - fixes and optimizations for the toggling of AMD AVIC (virtualized
     LAPIC)
   - tuning for the case when two-dimensional paging (EPT/NPT) is
     disabled
   - bugfixes and cleanups, especially with respect to vCPU reset and
     choosing a paging mode based on CR0/CR4/EFER
   - support for 5-level page table on AMD processors

  Generic:
   - MMU notifier invalidation callbacks do not take mmu_lock unless
     necessary
   - improved caching of LRU kvm_memory_slot
   - support for histogram statistics
   - add statistics for halt polling and remote TLB flush requests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (210 commits)
  KVM: Drop unused kvm_dirty_gfn_invalid()
  KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
  KVM: MMU: mark role_regs and role accessors as maybe unused
  KVM: MIPS: Remove a "set but not used" variable
  x86/kvm: Don't enable IRQ when IRQ enabled in kvm_wait
  KVM: stats: Add VM stat for remote tlb flush requests
  KVM: Remove unnecessary export of kvm_{inc,dec}_notifier_count()
  KVM: x86/mmu: Move lpage_disallowed_link further "down" in kvm_mmu_page
  KVM: x86/mmu: Relocate kvm_mmu_page.tdp_mmu_page for better cache locality
  Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
  KVM: x86/mmu: Remove unused field mmio_cached in struct kvm_mmu_page
  kvm: x86: Increase KVM_SOFT_MAX_VCPUS to 710
  kvm: x86: Increase MAX_VCPUS to 1024
  kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS
  KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
  KVM: x86/mmu: Don't freak out if pml5_root is NULL on 4-level host
  KVM: s390: index kvm->arch.idle_mask by vcpu_idx
  KVM: s390: Enable specification exception interpretation
  KVM: arm64: Trim guest debug exception handling
  KVM: SVM: Add 5-level page table support for SVM
  ...

13 files changed:
Documentation/virt/kvm/api.rst
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/sysreg.h
arch/arm64/kernel/cpufeature.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/hyperv.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/nested.c

@@@ -3357,6 -3357,7 +3357,7 @@@ flags which can include the following
    - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
    - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
    - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
+   - KVM_GUESTDBG_BLOCKIRQ:      avoid injecting interrupts/NMI/SMI [x86]
  
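For illustration, a minimal userspace sketch (not taken from the kernel tree) of
how the new KVM_GUESTDBG_BLOCKIRQ flag might be combined with single-stepping;
``vcpu_fd`` is assumed to be an already-created vCPU file descriptor::

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Single-step the vCPU while blocking IRQ/NMI/SMI injection (x86 only). */
    static int enable_blockirq_singlestep(int vcpu_fd)
    {
            struct kvm_guest_debug dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.control = KVM_GUESTDBG_ENABLE |
                          KVM_GUESTDBG_SINGLESTEP |
                          KVM_GUESTDBG_BLOCKIRQ;

            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }
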
  For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
  are enabled in memory so we need to ensure breakpoint exceptions are
@@@ -5077,7 -5078,7 +5078,7 @@@ of bytes successfully copied is returne
  then ``length`` is returned.
  
  4.131 KVM_GET_SREGS2
 -------------------
 +--------------------
  
  :Capability: KVM_CAP_SREGS2
  :Architectures: x86
@@@ -5090,17 -5091,17 +5091,17 @@@ This ioctl (when supported) replaces th
  
  ::
  
 -struct kvm_sregs2 {
 -      /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
 -      struct kvm_segment cs, ds, es, fs, gs, ss;
 -      struct kvm_segment tr, ldt;
 -      struct kvm_dtable gdt, idt;
 -      __u64 cr0, cr2, cr3, cr4, cr8;
 -      __u64 efer;
 -      __u64 apic_base;
 -      __u64 flags;
 -      __u64 pdptrs[4];
 -};
 +        struct kvm_sregs2 {
 +                /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
 +                struct kvm_segment cs, ds, es, fs, gs, ss;
 +                struct kvm_segment tr, ldt;
 +                struct kvm_dtable gdt, idt;
 +                __u64 cr0, cr2, cr3, cr4, cr8;
 +                __u64 efer;
 +                __u64 apic_base;
 +                __u64 flags;
 +                __u64 pdptrs[4];
 +        };
  
  flags values for ``kvm_sregs2``:
  
  
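For illustration, a minimal sketch of calling this ioctl from userspace; it is
not taken from the kernel tree, ``vcpu_fd`` is assumed to be an already-created
vCPU file descriptor, and KVM_CAP_SREGS2 is assumed to have been checked::

    #include <linux/kvm.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    /* Dump the cached PDPTRs if KVM reports them as valid. */
    static int dump_pdptrs(int vcpu_fd)
    {
            struct kvm_sregs2 sregs2;
            int i;

            if (ioctl(vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0)
                    return -1;

            if (sregs2.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID)
                    for (i = 0; i < 4; i++)
                            printf("pdptr[%d] = 0x%llx\n", i,
                                   (unsigned long long)sregs2.pdptrs[i]);
            return 0;
    }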
  
  4.132 KVM_SET_SREGS2
 -------------------
 +--------------------
  
  :Capability: KVM_CAP_SREGS2
  :Architectures: x86
@@@ -5201,13 -5202,15 +5202,16 @@@ trailing ``'\0'``, is indicated by the 
   The descriptors block only needs to be read once for the lifetime of the
   file descriptor. It contains a sequence of ``struct kvm_stats_desc``, each
   followed by a string of size ``name_size``.
 +::
  
        #define KVM_STATS_TYPE_SHIFT            0
        #define KVM_STATS_TYPE_MASK             (0xF << KVM_STATS_TYPE_SHIFT)
        #define KVM_STATS_TYPE_CUMULATIVE       (0x0 << KVM_STATS_TYPE_SHIFT)
        #define KVM_STATS_TYPE_INSTANT          (0x1 << KVM_STATS_TYPE_SHIFT)
        #define KVM_STATS_TYPE_PEAK             (0x2 << KVM_STATS_TYPE_SHIFT)
+       #define KVM_STATS_TYPE_LINEAR_HIST      (0x3 << KVM_STATS_TYPE_SHIFT)
+       #define KVM_STATS_TYPE_LOG_HIST         (0x4 << KVM_STATS_TYPE_SHIFT)
+       #define KVM_STATS_TYPE_MAX              KVM_STATS_TYPE_LOG_HIST
  
        #define KVM_STATS_UNIT_SHIFT            4
        #define KVM_STATS_UNIT_MASK             (0xF << KVM_STATS_UNIT_SHIFT)
        #define KVM_STATS_UNIT_BYTES            (0x1 << KVM_STATS_UNIT_SHIFT)
        #define KVM_STATS_UNIT_SECONDS          (0x2 << KVM_STATS_UNIT_SHIFT)
        #define KVM_STATS_UNIT_CYCLES           (0x3 << KVM_STATS_UNIT_SHIFT)
+       #define KVM_STATS_UNIT_MAX              KVM_STATS_UNIT_CYCLES
  
        #define KVM_STATS_BASE_SHIFT            8
        #define KVM_STATS_BASE_MASK             (0xF << KVM_STATS_BASE_SHIFT)
        #define KVM_STATS_BASE_POW10            (0x0 << KVM_STATS_BASE_SHIFT)
        #define KVM_STATS_BASE_POW2             (0x1 << KVM_STATS_BASE_SHIFT)
+       #define KVM_STATS_BASE_MAX              KVM_STATS_BASE_POW2
  
        struct kvm_stats_desc {
                __u32 flags;
                __s16 exponent;
                __u16 size;
                __u32 offset;
-               __u32 unused;
+               __u32 bucket_size;
                char name[];
        };
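
For illustration, a small standalone sketch of how a userspace consumer might
decode the type, unit and base packed into ``flags``; the defines are local
copies of the constants quoted above so the snippet compiles on its own::

    #include <stdint.h>

    #define KVM_STATS_TYPE_SHIFT    0
    #define KVM_STATS_TYPE_MASK     (0xF << KVM_STATS_TYPE_SHIFT)
    #define KVM_STATS_UNIT_SHIFT    4
    #define KVM_STATS_UNIT_MASK     (0xF << KVM_STATS_UNIT_SHIFT)
    #define KVM_STATS_BASE_SHIFT    8
    #define KVM_STATS_BASE_MASK     (0xF << KVM_STATS_BASE_SHIFT)

    /* Each helper returns a value comparable with the KVM_STATS_* constants. */
    static inline uint32_t stats_type(uint32_t flags)
    {
            return flags & KVM_STATS_TYPE_MASK;   /* e.g. KVM_STATS_TYPE_LOG_HIST */
    }

    static inline uint32_t stats_unit(uint32_t flags)
    {
            return flags & KVM_STATS_UNIT_MASK;   /* e.g. KVM_STATS_UNIT_SECONDS */
    }

    static inline uint32_t stats_base(uint32_t flags)
    {
            return flags & KVM_STATS_BASE_MASK;   /* POW10 or POW2 */
    }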
  
@@@ -5235,26 -5240,38 +5241,40 @@@ by this descriptor. Its endianness is C
  The following flags are supported:
  
  Bits 0-3 of ``flags`` encode the type:
 +
    * ``KVM_STATS_TYPE_CUMULATIVE``
-     The statistics data is cumulative. The value of data can only be increased.
+     The statistic reports a cumulative count. The value of data can only be increased.
      Most of the counters used in KVM are of this type.
      The corresponding ``size`` field for this type is always 1.
      All cumulative statistics data are read/write.
    * ``KVM_STATS_TYPE_INSTANT``
-     The statistics data is instantaneous. Its value can be increased or
+     The statistic reports an instantaneous value. Its value can be increased or
      decreased. This type is usually used as a measurement of some resources,
      like the number of dirty pages, the number of large pages, etc.
      All instant statistics are read only.
      The corresponding ``size`` field for this type is always 1.
    * ``KVM_STATS_TYPE_PEAK``
-     The statistics data is peak. The value of data can only be increased, and
-     represents a peak value for a measurement, for example the maximum number
+     The statistics data reports a peak value, for example the maximum number
      of items in a hash table bucket, the longest time waited and so on.
+     The value of data can only be increased.
      The corresponding ``size`` field for this type is always 1.
+   * ``KVM_STATS_TYPE_LINEAR_HIST``
+     The statistic is reported as a linear histogram. The number of
+     buckets is specified by the ``size`` field. The size of buckets is specified
+     by the ``bucket_size`` field. The range of the Nth bucket (1 <= N < ``size``)
+     is [``bucket_size``*(N-1), ``bucket_size``*N), while the range of the last
+     bucket is [``bucket_size``*(``size``-1), +INF). (+INF means positive infinity
+     value.) The bucket value indicates how many samples fell in the bucket's range.
+   * ``KVM_STATS_TYPE_LOG_HIST``
+     The statistic is reported as a logarithmic histogram. The number of
+     buckets is specified by the ``size`` field. The range of the first bucket is
+     [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF).
+     Otherwise, the Nth bucket (1 < N < ``size``) covers
+     [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell
+     in the bucket's range.
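
To make the bucket arithmetic concrete, a small sketch (not taken from the
kernel sources) that maps a sample value to its 0-based bucket index for both
histogram types, using the descriptor's ``size`` and ``bucket_size`` fields::

    #include <stdint.h>

    /* Linear histogram: bucket i covers [bucket_size*i, bucket_size*(i+1)),
     * except the last bucket, which extends to +INF. */
    static uint32_t linear_hist_index(uint64_t value, uint32_t bucket_size,
                                      uint32_t size)
    {
            uint64_t idx = value / bucket_size;

            return idx >= size ? size - 1 : (uint32_t)idx;
    }

    /* Log histogram: bucket 0 covers [0, 1), bucket i covers [2^(i-1), 2^i),
     * and the last bucket extends to +INF. */
    static uint32_t log_hist_index(uint64_t value, uint32_t size)
    {
            uint32_t idx = 0;

            while (value) {
                    value >>= 1;
                    idx++;
            }
            return idx >= size ? size - 1 : idx;
    }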
  
  Bits 4-7 of ``flags`` encode the unit:
 +
    * ``KVM_STATS_UNIT_NONE``
      There is no unit for the value of statistics data. This usually means that
      the value is a simple counter of an event.
  
  Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the
  unit:
 +
    * ``KVM_STATS_BASE_POW10``
      The scale is based on power of 10. It is used for measurement of time and
      CPU clock cycles.  For example, an exponent of -9 can be used with
@@@ -5286,9 -5302,9 +5306,9 @@@ unsigned 64bit data
  The ``offset`` field is the offset from the start of Data Block to the start of
  the corresponding statistics data.
  
- The ``unused`` field is reserved for future support for other types of
- statistics data, like log/linear histogram. Its value is always 0 for the types
- defined above.
+ The ``bucket_size`` field is used as a parameter for histogram statistics data.
+ It is only used by linear histogram statistics data, specifying the size of a
+ bucket.
  
  The ``name`` field is the name string of the statistics data. The name string
  starts at the end of ``struct kvm_stats_desc``.  The maximum length including
@@@ -7217,7 -7233,7 +7237,7 @@@ supported in the host. A VMM can check 
  available to the guest on migration.
  
  8.33 KVM_CAP_HYPERV_ENFORCE_CPUID
 ------------------------------
 +---------------------------------
  
  Architectures: x86
  
@@@ -552,7 -552,7 +552,7 @@@ cpuid_feature_cap_perfmon_field(u64 fea
        u64 mask = GENMASK_ULL(field + 3, field);
  
        /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */
 -      if (val == 0xf)
 +      if (val == ID_AA64DFR0_PMUVER_IMP_DEF)
                val = 0;
  
        if (val > cap) {
@@@ -602,14 -602,14 +602,14 @@@ static inline bool id_aa64pfr0_32bit_el
  {
        u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT);
  
-       return val == ID_AA64PFR0_EL1_32BIT_64BIT;
+       return val == ID_AA64PFR0_ELx_32BIT_64BIT;
  }
  
  static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
  {
        u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT);
  
-       return val == ID_AA64PFR0_EL0_32BIT_64BIT;
+       return val == ID_AA64PFR0_ELx_32BIT_64BIT;
  }
  
  static inline bool id_aa64pfr0_sve(u64 pfr0)
@@@ -784,13 -784,13 +784,13 @@@ extern int do_emulate_mrs(struct pt_reg
  static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
  {
        switch (parange) {
-       case 0: return 32;
-       case 1: return 36;
-       case 2: return 40;
-       case 3: return 42;
-       case 4: return 44;
-       case 5: return 48;
-       case 6: return 52;
+       case ID_AA64MMFR0_PARANGE_32: return 32;
+       case ID_AA64MMFR0_PARANGE_36: return 36;
+       case ID_AA64MMFR0_PARANGE_40: return 40;
+       case ID_AA64MMFR0_PARANGE_42: return 42;
+       case ID_AA64MMFR0_PARANGE_44: return 44;
+       case ID_AA64MMFR0_PARANGE_48: return 48;
+       case ID_AA64MMFR0_PARANGE_52: return 52;
        /*
         * A future PE could use a value unknown to the kernel.
         * However, by the "D10.1.4 Principles of the ID scheme
@@@ -11,7 -11,6 +11,7 @@@
  
  #include <linux/bits.h>
  #include <linux/stringify.h>
 +#include <linux/kasan-tags.h>
  
  /*
   * ARMv8 ARM reserves the following encoding for system registers:
        (SCTLR_ELx_M    | SCTLR_ELx_C    | SCTLR_ELx_SA   | SCTLR_EL1_SA0   | \
         SCTLR_EL1_SED  | SCTLR_ELx_I    | SCTLR_EL1_DZE  | SCTLR_EL1_UCT   | \
         SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN | SCTLR_ELx_ITFSB | \
 -       SCTLR_ELx_ATA  | SCTLR_EL1_ATA0 | ENDIAN_SET_EL1 | SCTLR_EL1_UCI   | \
 -       SCTLR_EL1_EPAN | SCTLR_EL1_RES1)
 +       ENDIAN_SET_EL1 | SCTLR_EL1_UCI  | SCTLR_EL1_EPAN | SCTLR_EL1_RES1)
  
  /* MAIR_ELx memory attributes (used by Linux) */
  #define MAIR_ATTR_DEVICE_nGnRnE               UL(0x00)
  #define ID_AA64PFR0_AMU                       0x1
  #define ID_AA64PFR0_SVE                       0x1
  #define ID_AA64PFR0_RAS_V1            0x1
+ #define ID_AA64PFR0_RAS_V1P1          0x2
  #define ID_AA64PFR0_FP_NI             0xf
  #define ID_AA64PFR0_FP_SUPPORTED      0x0
  #define ID_AA64PFR0_ASIMD_NI          0xf
  #define ID_AA64PFR0_ASIMD_SUPPORTED   0x0
- #define ID_AA64PFR0_EL1_64BIT_ONLY    0x1
- #define ID_AA64PFR0_EL1_32BIT_64BIT   0x2
- #define ID_AA64PFR0_EL0_64BIT_ONLY    0x1
- #define ID_AA64PFR0_EL0_32BIT_64BIT   0x2
+ #define ID_AA64PFR0_ELx_64BIT_ONLY    0x1
+ #define ID_AA64PFR0_ELx_32BIT_64BIT   0x2
  
  /* id_aa64pfr1 */
  #define ID_AA64PFR1_MPAMFRAC_SHIFT    16
  #define ID_AA64MMFR0_ASID_SHIFT               4
  #define ID_AA64MMFR0_PARANGE_SHIFT    0
  
+ #define ID_AA64MMFR0_ASID_8           0x0
+ #define ID_AA64MMFR0_ASID_16          0x2
  #define ID_AA64MMFR0_TGRAN4_NI                        0xf
  #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN     0x0
  #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX     0x7
  #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN    0x1
  #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX    0xf
  
+ #define ID_AA64MMFR0_PARANGE_32               0x0
+ #define ID_AA64MMFR0_PARANGE_36               0x1
+ #define ID_AA64MMFR0_PARANGE_40               0x2
+ #define ID_AA64MMFR0_PARANGE_42               0x3
+ #define ID_AA64MMFR0_PARANGE_44               0x4
  #define ID_AA64MMFR0_PARANGE_48               0x5
  #define ID_AA64MMFR0_PARANGE_52               0x6
  
+ #define ARM64_MIN_PARANGE_BITS                32
  #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT        0x0
  #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE   0x1
  #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN    0x2
  #define ID_AA64MMFR2_CNP_SHIFT                0
  
  /* id_aa64dfr0 */
+ #define ID_AA64DFR0_MTPMU_SHIFT               48
  #define ID_AA64DFR0_TRBE_SHIFT                44
  #define ID_AA64DFR0_TRACE_FILT_SHIFT  40
  #define ID_AA64DFR0_DOUBLELOCK_SHIFT  36
  #define ID_AA64MMFR0_TGRAN_SHIFT              ID_AA64MMFR0_TGRAN4_SHIFT
  #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN      ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN
  #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX      ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX
+ #define ID_AA64MMFR0_TGRAN_2_SHIFT            ID_AA64MMFR0_TGRAN4_2_SHIFT
  #elif defined(CONFIG_ARM64_16K_PAGES)
  #define ID_AA64MMFR0_TGRAN_SHIFT              ID_AA64MMFR0_TGRAN16_SHIFT
  #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN      ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN
  #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX      ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX
+ #define ID_AA64MMFR0_TGRAN_2_SHIFT            ID_AA64MMFR0_TGRAN16_2_SHIFT
  #elif defined(CONFIG_ARM64_64K_PAGES)
  #define ID_AA64MMFR0_TGRAN_SHIFT              ID_AA64MMFR0_TGRAN64_SHIFT
  #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN      ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN
  #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX      ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX
+ #define ID_AA64MMFR0_TGRAN_2_SHIFT            ID_AA64MMFR0_TGRAN64_2_SHIFT
  #endif
  
  #define MVFR2_FPMISC_SHIFT            4
  #define SYS_GCR_EL1_RRND      (BIT(16))
  #define SYS_GCR_EL1_EXCL_MASK 0xffffUL
  
 +#ifdef CONFIG_KASAN_HW_TAGS
 +/*
 + * KASAN always uses a whole byte for its tags. With CONFIG_KASAN_HW_TAGS it
 + * only uses tags in the range 0xF0-0xFF, which we map to MTE tags 0x0-0xF.
 + */
 +#define __MTE_TAG_MIN         (KASAN_TAG_MIN & 0xf)
 +#define __MTE_TAG_MAX         (KASAN_TAG_MAX & 0xf)
 +#define __MTE_TAG_INCL                GENMASK(__MTE_TAG_MAX, __MTE_TAG_MIN)
 +#define KERNEL_GCR_EL1_EXCL   (SYS_GCR_EL1_EXCL_MASK & ~__MTE_TAG_INCL)
 +#else
 +#define KERNEL_GCR_EL1_EXCL   SYS_GCR_EL1_EXCL_MASK
 +#endif
 +
 +#define KERNEL_GCR_EL1                (SYS_GCR_EL1_RRND | KERNEL_GCR_EL1_EXCL)
 +
  /* RGSR_EL1 Definitions */
  #define SYS_RGSR_EL1_TAG_MASK 0xfUL
  #define SYS_RGSR_EL1_SEED_SHIFT       8
  #define ICH_VTR_A3V_SHIFT     21
  #define ICH_VTR_A3V_MASK      (1 << ICH_VTR_A3V_SHIFT)
  
+ #define ARM64_FEATURE_FIELD_BITS      4
+ /* Create a mask for the feature bits of the specified feature. */
+ #define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT))
  #ifdef __ASSEMBLY__
  
        .irp    num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
@@@ -67,7 -67,6 +67,7 @@@
  #include <linux/crash_dump.h>
  #include <linux/sort.h>
  #include <linux/stop_machine.h>
 +#include <linux/sysfs.h>
  #include <linux/types.h>
  #include <linux/minmax.h>
  #include <linux/mm.h>
@@@ -240,8 -239,8 +240,8 @@@ static const struct arm64_ftr_bits ftr_
        S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0),
-       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
-       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
        ARM64_FTR_END,
  };
  
@@@ -1322,31 -1321,6 +1322,31 @@@ const struct cpumask *system_32bit_el0_
        return cpu_possible_mask;
  }
  
 +static int __init parse_32bit_el0_param(char *str)
 +{
 +      allow_mismatched_32bit_el0 = true;
 +      return 0;
 +}
 +early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param);
 +
 +static ssize_t aarch32_el0_show(struct device *dev,
 +                              struct device_attribute *attr, char *buf)
 +{
 +      const struct cpumask *mask = system_32bit_el0_cpumask();
 +
 +      return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(mask));
 +}
 +static const DEVICE_ATTR_RO(aarch32_el0);
 +
 +static int __init aarch32_el0_sysfs_init(void)
 +{
 +      if (!allow_mismatched_32bit_el0)
 +              return 0;
 +
 +      return device_create_file(cpu_subsys.dev_root, &dev_attr_aarch32_el0);
 +}
 +device_initcall(aarch32_el0_sysfs_init);
 +
  static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope)
  {
        if (!has_cpuid_feature(entry, scope))
@@@ -1587,6 -1561,8 +1587,6 @@@ kpti_install_ng_mappings(const struct a
  
        if (!cpu)
                arm64_use_ng_mappings = true;
 -
 -      return;
  }
  #else
  static void
@@@ -1758,7 -1734,7 +1758,7 @@@ static void cpu_has_fwb(const struct ar
        u64 val = read_sysreg_s(SYS_CLIDR_EL1);
  
        /* Check that CLIDR_EL1.LOU{U,IS} are both 0 */
 -      WARN_ON(val & (7 << 27 | 7 << 21));
 +      WARN_ON(CLIDR_LOUU(val) || CLIDR_LOUIS(val));
  }
  
  #ifdef CONFIG_ARM64_PAN
@@@ -1867,9 -1843,6 +1867,9 @@@ static void bti_enable(const struct arm
  #ifdef CONFIG_ARM64_MTE
  static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
  {
 +      sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0);
 +      isb();
 +
        /*
         * Clear the tags in the zero page. This needs to be done via the
         * linear map which has the Tagged attribute.
@@@ -1983,7 -1956,7 +1983,7 @@@ static const struct arm64_cpu_capabilit
                .sys_reg = SYS_ID_AA64PFR0_EL1,
                .sign = FTR_UNSIGNED,
                .field_pos = ID_AA64PFR0_EL0_SHIFT,
-               .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
+               .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
        },
  #ifdef CONFIG_KVM
        {
                .sys_reg = SYS_ID_AA64PFR0_EL1,
                .sign = FTR_UNSIGNED,
                .field_pos = ID_AA64PFR0_EL1_SHIFT,
-               .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
+               .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
        },
        {
                .desc = "Protected KVM",
@@@ -2928,38 -2901,15 +2928,38 @@@ void __init setup_cpu_features(void
  
  static int enable_mismatched_32bit_el0(unsigned int cpu)
  {
 +      /*
 +       * The first 32-bit-capable CPU we detected and so can no longer
 +       * be offlined by userspace. -1 indicates we haven't yet onlined
 +       * a 32-bit-capable CPU.
 +       */
 +      static int lucky_winner = -1;
 +
        struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu);
        bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0);
  
        if (cpu_32bit) {
                cpumask_set_cpu(cpu, cpu_32bit_el0_mask);
                static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0);
 -              setup_elf_hwcaps(compat_elf_hwcaps);
        }
  
 +      if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit)
 +              return 0;
 +
 +      if (lucky_winner >= 0)
 +              return 0;
 +
 +      /*
 +       * We've detected a mismatch. We need to keep one of our CPUs with
 +       * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting
 +       * every CPU in the system for a 32-bit task.
 +       */
 +      lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask,
 +                                                       cpu_active_mask);
 +      get_cpu_device(lucky_winner)->offline_disabled = true;
 +      setup_elf_hwcaps(compat_elf_hwcaps);
 +      pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n",
 +              cpu, lucky_winner);
        return 0;
  }
  
@@@ -103,7 -103,6 +103,6 @@@ struct kvm_vcpu_stat 
        u64 emulated_inst_exits;
        u64 dec_exits;
        u64 ext_intr_exits;
-       u64 halt_wait_ns;
        u64 halt_successful_wait;
        u64 dbell_exits;
        u64 gdbell_exits;
@@@ -811,8 -810,6 +810,8 @@@ struct kvm_vcpu_arch 
  
        u32 online;
  
 +      u64 hfscr_permitted;    /* A mask of permitted HFSCR facilities */
 +
        /* For support of nested guests */
        struct kvm_nested_guest *nested;
        u32 nested_vcpu_id;
@@@ -80,7 -80,7 +80,7 @@@ static long kvmppc_rm_tce_to_ua(struct 
        unsigned long gfn = tce >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
  
-       memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        if (!memslot)
                return -EINVAL;
  
@@@ -173,13 -173,10 +173,13 @@@ static void kvmppc_rm_tce_put(struct kv
        idx -= stt->offset;
        page = stt->pages[idx / TCES_PER_PAGE];
        /*
 -       * page must not be NULL in real mode,
 -       * kvmppc_rm_ioba_validate() must have taken care of this.
 +       * kvmppc_rm_ioba_validate() allows pages not to be allocated if TCE is
 +       * being cleared, otherwise it returns H_TOO_HARD and we skip this.
         */
 -      WARN_ON_ONCE_RM(!page);
 +      if (!page) {
 +              WARN_ON_ONCE_RM(tce != 0);
 +              return;
 +      }
        tbl = kvmppc_page_address(page);
  
        tbl[idx % TCES_PER_PAGE] = tce;
@@@ -59,7 -59,6 +59,7 @@@
  #include <asm/kvm_book3s.h>
  #include <asm/mmu_context.h>
  #include <asm/lppaca.h>
 +#include <asm/pmc.h>
  #include <asm/processor.h>
  #include <asm/cputhreads.h>
  #include <asm/page.h>
@@@ -1166,7 -1165,7 +1166,7 @@@ int kvmppc_pseries_do_hcall(struct kvm_
                break;
  #endif
        case H_RANDOM:
 -              if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
 +              if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4]))
                        ret = H_HARDWARE;
                break;
        case H_RPT_INVALIDATE:
@@@ -1680,21 -1679,6 +1680,21 @@@ static int kvmppc_handle_exit_hv(struc
                        r = RESUME_GUEST;
                }
                break;
 +
 +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 +      case BOOK3S_INTERRUPT_HV_SOFTPATCH:
 +              /*
 +               * This occurs for various TM-related instructions that
 +               * we need to emulate on POWER9 DD2.2.  We have already
 +               * handled the cases where the guest was in real-suspend
 +               * mode and was transitioning to transactional state.
 +               */
 +              r = kvmhv_p9_tm_emulation(vcpu);
 +              if (r != -1)
 +                      break;
 +              fallthrough; /* go to facility unavailable handler */
 +#endif
 +
        /*
         * This occurs if the guest (kernel or userspace), does something that
         * is prohibited by HFSCR.
                }
                break;
  
 -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 -      case BOOK3S_INTERRUPT_HV_SOFTPATCH:
 -              /*
 -               * This occurs for various TM-related instructions that
 -               * we need to emulate on POWER9 DD2.2.  We have already
 -               * handled the cases where the guest was in real-suspend
 -               * mode and was transitioning to transactional state.
 -               */
 -              r = kvmhv_p9_tm_emulation(vcpu);
 -              break;
 -#endif
 -
        case BOOK3S_INTERRUPT_HV_RM_HARD:
                r = RESUME_PASSTHROUGH;
                break;
  
  static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
  {
 +      struct kvm_nested_guest *nested = vcpu->arch.nested;
        int r;
        int srcu_idx;
  
                 * mode and was transitioning to transactional state.
                 */
                r = kvmhv_p9_tm_emulation(vcpu);
 -              break;
 +              if (r != -1)
 +                      break;
 +              fallthrough; /* go to facility unavailable handler */
  #endif
  
 +      case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
 +              u64 cause = vcpu->arch.hfscr >> 56;
 +
 +              /*
 +               * Only pass HFU interrupts to the L1 if the facility is
 +               * permitted but disabled by the L1's HFSCR, otherwise
 +               * the interrupt does not make sense to the L1 so turn
 +               * it into a HEAI.
 +               */
 +              if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) ||
 +                                      (nested->hfscr & (1UL << cause))) {
 +                      vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;
 +
 +                      /*
 +                       * If the fetch failed, return to guest and
 +                       * try executing it again.
 +                       */
 +                      r = kvmppc_get_last_inst(vcpu, INST_GENERIC,
 +                                               &vcpu->arch.emul_inst);
 +                      if (r != EMULATE_DONE)
 +                              r = RESUME_GUEST;
 +                      else
 +                              r = RESUME_HOST;
 +              } else {
 +                      r = RESUME_HOST;
 +              }
 +
 +              break;
 +      }
 +
        case BOOK3S_INTERRUPT_HV_RM_HARD:
                vcpu->arch.trap = 0;
                r = RESUME_GUEST;
@@@ -2721,7 -2684,6 +2721,7 @@@ static int kvmppc_core_vcpu_create_hv(s
        spin_lock_init(&vcpu->arch.vpa_update_lock);
        spin_lock_init(&vcpu->arch.tbacct_lock);
        vcpu->arch.busy_preempt = TB_NIL;
 +      vcpu->arch.shregs.msr = MSR_ME;
        vcpu->arch.intr_msr = MSR_SF | MSR_ME;
  
        /*
        if (cpu_has_feature(CPU_FTR_TM_COMP))
                vcpu->arch.hfscr |= HFSCR_TM;
  
 +      vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
 +
        kvmppc_mmu_book3s_hv_init(vcpu);
  
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@@ -3767,6 -3727,7 +3767,6 @@@ static void load_spr_state(struct kvm_v
        mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
        mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
        mtspr(SPRN_BESCR, vcpu->arch.bescr);
 -      mtspr(SPRN_WORT, vcpu->arch.wort);
        mtspr(SPRN_TIDR, vcpu->arch.tid);
        mtspr(SPRN_AMR, vcpu->arch.amr);
        mtspr(SPRN_UAMOR, vcpu->arch.uamor);
@@@ -3793,6 -3754,7 +3793,6 @@@ static void store_spr_state(struct kvm_
        vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
        vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
        vcpu->arch.bescr = mfspr(SPRN_BESCR);
 -      vcpu->arch.wort = mfspr(SPRN_WORT);
        vcpu->arch.tid = mfspr(SPRN_TIDR);
        vcpu->arch.amr = mfspr(SPRN_AMR);
        vcpu->arch.uamor = mfspr(SPRN_UAMOR);
@@@ -3824,6 -3786,7 +3824,6 @@@ static void restore_p9_host_os_sprs(str
                                    struct p9_host_os_sprs *host_os_sprs)
  {
        mtspr(SPRN_PSPB, 0);
 -      mtspr(SPRN_WORT, 0);
        mtspr(SPRN_UAMOR, 0);
  
        mtspr(SPRN_DSCR, host_os_sprs->dscr);
@@@ -3889,18 -3852,6 +3889,18 @@@ static int kvmhv_p9_guest_entry(struct 
            cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
                kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
  
 +#ifdef CONFIG_PPC_PSERIES
 +      if (kvmhv_on_pseries()) {
 +              barrier();
 +              if (vcpu->arch.vpa.pinned_addr) {
 +                      struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
 +                      get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
 +              } else {
 +                      get_lppaca()->pmcregs_in_use = 1;
 +              }
 +              barrier();
 +      }
 +#endif
        kvmhv_load_guest_pmu(vcpu);
  
        msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
        save_pmu |= nesting_enabled(vcpu->kvm);
  
        kvmhv_save_guest_pmu(vcpu, save_pmu);
 +#ifdef CONFIG_PPC_PSERIES
 +      if (kvmhv_on_pseries()) {
 +              barrier();
 +              get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
 +              barrier();
 +      }
 +#endif
  
        vc->entry_exit_map = 0x101;
        vc->in_guest = 0;
  
        /* Attribute wait time */
        if (do_sleep) {
-               vc->runner->stat.halt_wait_ns +=
+               vc->runner->stat.generic.halt_wait_ns +=
                        ktime_to_ns(cur) - ktime_to_ns(start_wait);
+               KVM_STATS_LOG_HIST_UPDATE(
+                               vc->runner->stat.generic.halt_wait_hist,
+                               ktime_to_ns(cur) - ktime_to_ns(start_wait));
                /* Attribute failed poll time */
-               if (vc->halt_poll_ns)
+               if (vc->halt_poll_ns) {
                        vc->runner->stat.generic.halt_poll_fail_ns +=
                                ktime_to_ns(start_wait) -
                                ktime_to_ns(start_poll);
+                       KVM_STATS_LOG_HIST_UPDATE(
+                               vc->runner->stat.generic.halt_poll_fail_hist,
+                               ktime_to_ns(start_wait) -
+                               ktime_to_ns(start_poll));
+               }
        } else {
                /* Attribute successful poll time */
-               if (vc->halt_poll_ns)
+               if (vc->halt_poll_ns) {
                        vc->runner->stat.generic.halt_poll_success_ns +=
                                ktime_to_ns(cur) -
                                ktime_to_ns(start_poll);
+                       KVM_STATS_LOG_HIST_UPDATE(
+                               vc->runner->stat.generic.halt_poll_success_hist,
+                               ktime_to_ns(cur) - ktime_to_ns(start_poll));
+               }
        }
  
        /* Adjust poll time */
@@@ -5384,7 -5340,6 +5396,7 @@@ static int kvmppc_set_passthru_irq(stru
        struct kvmppc_passthru_irqmap *pimap;
        struct irq_chip *chip;
        int i, rc = 0;
 +      struct irq_data *host_data;
  
        if (!kvm_irq_bypass)
                return 1;
         * what our real-mode EOI code does, or a XIVE interrupt
         */
        chip = irq_data_get_irq_chip(&desc->irq_data);
 -      if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
 +      if (!chip || !is_pnv_opal_msi(chip)) {
                pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
                        host_irq, guest_gsi);
                mutex_unlock(&kvm->lock);
         * the KVM real mode handler.
         */
        smp_wmb();
 -      irq_map->r_hwirq = desc->irq_data.hwirq;
 +
 +      /*
 +       * The 'host_irq' number is mapped in the PCI-MSI domain but
 +       * the underlying calls, which will EOI the interrupt in real
 +       * mode, need an HW IRQ number mapped in the XICS IRQ domain.
 +       */
 +      host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
 +      irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
  
        if (i == pimap->n_mapped)
                pimap->n_mapped++;
  
        if (xics_on_xive())
 -              rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
 +              rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
        else
 -              kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
 +              kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
        if (rc)
                irq_map->r_hwirq = 0;
  
@@@ -5503,7 -5451,7 +5515,7 @@@ static int kvmppc_clr_passthru_irq(stru
        }
  
        if (xics_on_xive())
 -              rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
 +              rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
        else
                kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
  
@@@ -244,6 -244,7 +244,7 @@@ struct kvm_s390_sie_block 
        __u8    fpf;                    /* 0x0060 */
  #define ECB_GS                0x40
  #define ECB_TE                0x10
+ #define ECB_SPECI     0x08
  #define ECB_SRSI      0x04
  #define ECB_HOSTPROTINT       0x02
        __u8    ecb;                    /* 0x0061 */
@@@ -798,12 -799,14 +799,12 @@@ struct kvm_s390_cpu_model 
        unsigned short ibc;
  };
  
 -struct kvm_s390_module_hook {
 -      int (*hook)(struct kvm_vcpu *vcpu);
 -      struct module *owner;
 -};
 +typedef int (*crypto_hook)(struct kvm_vcpu *vcpu);
  
  struct kvm_s390_crypto {
        struct kvm_s390_crypto_cb *crycb;
 -      struct kvm_s390_module_hook *pqap_hook;
 +      struct rw_semaphore pqap_hook_rwsem;
 +      crypto_hook *pqap_hook;
        __u32 crycbd;
        __u8 aes_kw;
        __u8 dea_kw;
@@@ -955,6 -958,7 +956,7 @@@ struct kvm_arch
        atomic64_t cmma_dirty_pages;
        /* subset of available cpu features enabled by user space */
        DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+       /* indexed by vcpu_idx */
        DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
        struct kvm_s390_gisa_interrupt gisa_int;
        struct kvm_s390_pv pv;
diff --combined arch/s390/kvm/kvm-s390.c
@@@ -66,8 -66,6 +66,6 @@@ const struct _kvm_stats_desc kvm_vm_sta
        STATS_DESC_COUNTER(VM, inject_service_signal),
        STATS_DESC_COUNTER(VM, inject_virtio)
  };
- static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
  
  const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@@ -174,8 -172,6 +172,6 @@@ const struct _kvm_stats_desc kvm_vcpu_s
        STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
        STATS_DESC_COUNTER(VCPU, pfault_sync)
  };
- static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
  
  const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@@ -1953,7 -1949,7 +1949,7 @@@ out
  static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
  {
        int start = 0, end = slots->used_slots;
-       int slot = atomic_read(&slots->lru_slot);
+       int slot = atomic_read(&slots->last_used_slot);
        struct kvm_memory_slot *memslots = slots->memslots;
  
        if (gfn >= memslots[slot].base_gfn &&
  
        if (gfn >= memslots[start].base_gfn &&
            gfn < memslots[start].base_gfn + memslots[start].npages) {
-               atomic_set(&slots->lru_slot, start);
+               atomic_set(&slots->last_used_slot, start);
        }
  
        return start;
@@@ -2559,26 -2555,12 +2555,26 @@@ static void kvm_s390_set_crycb_format(s
                kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
  }
  
 +/*
 + * kvm_arch_crypto_set_masks
 + *
 + * @kvm: pointer to the target guest's KVM struct containing the crypto masks
 + *     to be set.
 + * @apm: the mask identifying the accessible AP adapters
 + * @aqm: the mask identifying the accessible AP domains
 + * @adm: the mask identifying the accessible AP control domains
 + *
 + * Set the masks that identify the adapters, domains and control domains to
 + * which the KVM guest is granted access.
 + *
 + * Note: The kvm->lock mutex must be locked by the caller before invoking this
 + *     function.
 + */
  void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
                               unsigned long *aqm, unsigned long *adm)
  {
        struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb;
  
 -      mutex_lock(&kvm->lock);
        kvm_s390_vcpu_block_all(kvm);
  
        switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) {
        /* recreate the shadow crycb for each vcpu */
        kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
        kvm_s390_vcpu_unblock_all(kvm);
 -      mutex_unlock(&kvm->lock);
  }
  EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks);
  
 +/*
 + * kvm_arch_crypto_clear_masks
 + *
 + * @kvm: pointer to the target guest's KVM struct containing the crypto masks
 + *     to be cleared.
 + *
 + * Clear the masks that identify the adapters, domains and control domains to
 + * which the KVM guest is granted access.
 + *
 + * Note: The kvm->lock mutex must be locked by the caller before invoking this
 + *     function.
 + */
  void kvm_arch_crypto_clear_masks(struct kvm *kvm)
  {
 -      mutex_lock(&kvm->lock);
        kvm_s390_vcpu_block_all(kvm);
  
        memset(&kvm->arch.crypto.crycb->apcb0, 0,
        /* recreate the shadow crycb for each vcpu */
        kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
        kvm_s390_vcpu_unblock_all(kvm);
 -      mutex_unlock(&kvm->lock);
  }
  EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks);
  
@@@ -2653,7 -2626,6 +2649,7 @@@ static void kvm_s390_crypto_init(struc
  {
        kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
        kvm_s390_set_crycb_format(kvm);
 +      init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem);
  
        if (!test_kvm_facility(kvm, 76))
                return;
@@@ -3224,6 -3196,8 +3220,8 @@@ static int kvm_s390_vcpu_setup(struct k
                vcpu->arch.sie_block->ecb |= ECB_SRSI;
        if (test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= ECB_TE;
+       if (!kvm_is_ucontrol(vcpu->kvm))
+               vcpu->arch.sie_block->ecb |= ECB_SPECI;
  
        if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
                vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
@@@ -4068,7 -4042,7 +4066,7 @@@ static int vcpu_pre_run(struct kvm_vcp
                kvm_s390_patch_guest_per_regs(vcpu);
        }
  
-       clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+       clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
  
        vcpu->arch.sie_block->icptcode = 0;
        cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --combined arch/x86/kvm/hyperv.c
@@@ -88,6 -88,10 +88,10 @@@ static bool synic_has_vector_auto_eoi(s
  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
                                int vector)
  {
+       struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+       struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+       int auto_eoi_old, auto_eoi_new;
        if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
                return;
  
        else
                __clear_bit(vector, synic->vec_bitmap);
  
+       auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
        if (synic_has_vector_auto_eoi(synic, vector))
                __set_bit(vector, synic->auto_eoi_bitmap);
        else
                __clear_bit(vector, synic->auto_eoi_bitmap);
+       auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
+       if (!!auto_eoi_old == !!auto_eoi_new)
+               return;
+       mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+       if (auto_eoi_new)
+               hv->synic_auto_eoi_used++;
+       else
+               hv->synic_auto_eoi_used--;
+       __kvm_request_apicv_update(vcpu->kvm,
+                                  !hv->synic_auto_eoi_used,
+                                  APICV_INHIBIT_REASON_HYPERV);
+       mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
  }
  
  static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@@ -933,12 -957,6 +957,6 @@@ int kvm_hv_activate_synic(struct kvm_vc
  
        synic = to_hv_synic(vcpu);
  
-       /*
-        * Hyper-V SynIC auto EOI SINT's are
-        * not compatible with APICV, so request
-        * to deactivate APICV permanently.
-        */
-       kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
        synic->active = true;
        synic->dont_zero_synic_pages = dont_zero_synic_pages;
        synic->control = HV_SYNIC_CONTROL_ENABLE;
@@@ -1933,7 -1951,7 +1951,7 @@@ ret_success
  void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
  {
        struct kvm_cpuid_entry2 *entry;
 -      struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 +      struct kvm_vcpu_hv *hv_vcpu;
  
        entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
        if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
@@@ -2476,6 -2494,8 +2494,8 @@@ int kvm_get_hv_cpuid(struct kvm_vcpu *v
                                ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
                        if (!cpu_smt_possible())
                                ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+                       ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
                        /*
                         * Default number of spinlock retry attempts, matches
                         * HyperV 2016.
@@@ -158,9 -158,6 +158,9 @@@ void recalc_intercepts(struct vcpu_svm 
        /* If SMI is not intercepted, ignore guest SMI intercept as well  */
        if (!intercept_smi)
                vmcb_clr_intercept(c, INTERCEPT_SMI);
 +
 +      vmcb_set_intercept(c, INTERCEPT_VMLOAD);
 +      vmcb_set_intercept(c, INTERCEPT_VMSAVE);
  }
  
  static void copy_vmcb_control_area(struct vmcb_control_area *dst,
@@@ -506,11 -503,7 +506,11 @@@ static void nested_vmcb02_prepare_save(
  
  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
  {
 -      const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 +      const u32 int_ctl_vmcb01_bits =
 +              V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;
 +
 +      const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
 +
        struct kvm_vcpu *vcpu = &svm->vcpu;
  
        /*
                vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
  
        svm->vmcb->control.int_ctl             =
 -              (svm->nested.ctl.int_ctl & ~mask) |
 -              (svm->vmcb01.ptr->control.int_ctl & mask);
 +              (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
 +              (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);
  
        svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
        svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
@@@ -666,11 -659,6 +666,6 @@@ int nested_svm_vmrun(struct kvm_vcpu *v
                goto out;
        }
  
-       /* Clear internal status */
-       kvm_clear_exception_queue(vcpu);
-       kvm_clear_interrupt_queue(vcpu);
        /*
         * Since vmcb01 is not in use, we can use it to store some of the L1
         * state.
diff --combined arch/x86/kvm/svm/svm.c
@@@ -46,8 -46,6 +46,6 @@@
  #include "kvm_onhyperv.h"
  #include "svm_onhyperv.h"
  
- #define __ex(x) __kvm_handle_fault_on_reboot(x)
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
@@@ -261,7 -259,7 +259,7 @@@ u32 svm_msrpm_offset(u32 msr
  static int get_max_npt_level(void)
  {
  #ifdef CONFIG_X86_64
-       return PT64_ROOT_4LEVEL;
+       return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
  #else
        return PT32E_ROOT_LEVEL;
  #endif
@@@ -462,11 -460,6 +460,6 @@@ static int has_svm(void
                return 0;
        }
  
-       if (pgtable_l5_enabled()) {
-               pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
-               return 0;
-       }
        return 1;
  }
  
@@@ -1015,7 -1008,9 +1008,9 @@@ static __init int svm_hardware_setup(vo
        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
  
-       kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+       /* Force VM NPT level equal to the host's max NPT level */
+       kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+                         get_max_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
  
        /* Note, SEV setup consumes npt_enabled. */
@@@ -1161,8 -1156,6 +1156,6 @@@ static void init_vmcb(struct kvm_vcpu *
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
  
-       vcpu->arch.hflags = 0;
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
        svm_set_intercept(svm, INTERCEPT_CR4_READ);
                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
        save->cs.limit = 0xffff;
  
+       save->gdtr.base = 0;
        save->gdtr.limit = 0xffff;
+       save->idtr.base = 0;
        save->idtr.limit = 0xffff;
  
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
  
-       svm_set_cr4(vcpu, 0);
-       svm_set_efer(vcpu, 0);
-       save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
-       save->rip = 0x0000fff0;
-       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
-       /*
-        * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
-        * It also updates the guest-visible cr0 value.
-        */
-       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(vcpu);
-       save->cr4 = X86_CR4_PAE;
-       /* rdx = ?? */
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
-               save->cr4 = 0;
        }
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
  
        svm->nested.vmcb12_gpa = INVALID_GPA;
        svm->nested.last_vmcb12_gpa = INVALID_GPA;
-       vcpu->arch.hflags = 0;
  
        if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
  static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u32 dummy;
-       u32 eax = 1;
  
        svm->spec_ctrl = 0;
        svm->virt_spec_ctrl = 0;
  
-       if (!init_event) {
-               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                      MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(vcpu))
-                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
-       }
        init_vmcb(vcpu);
-       kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
-       kvm_rdx_write(vcpu, eax);
-       if (kvm_vcpu_apicv_active(vcpu) && !init_event)
-               avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
  }
  
  void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@@ -1513,12 -1475,15 +1475,15 @@@ static void svm_vcpu_load(struct kvm_vc
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
        }
-       avic_vcpu_load(vcpu, cpu);
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_vcpu_load(vcpu, cpu);
  }
  
  static void svm_vcpu_put(struct kvm_vcpu *vcpu)
  {
-       avic_vcpu_put(vcpu);
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_vcpu_put(vcpu);
        svm_prepare_host_switch(vcpu);
  
        ++vcpu->stat.host_state_reload;
@@@ -1560,7 -1525,7 +1525,7 @@@ static void svm_cache_reg(struct kvm_vc
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                break;
        default:
-               WARN_ON_ONCE(1);
+               KVM_BUG_ON(1, vcpu->kvm);
        }
  }
  
@@@ -1589,18 -1554,17 +1554,18 @@@ static void svm_set_vintr(struct vcpu_s
  
  static void svm_clear_vintr(struct vcpu_svm *svm)
  {
 -      const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
        svm_clr_intercept(svm, INTERCEPT_VINTR);
  
        /* Drop int_ctl fields related to VINTR injection.  */
 -      svm->vmcb->control.int_ctl &= mask;
 +      svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
        if (is_guest_mode(&svm->vcpu)) {
 -              svm->vmcb01.ptr->control.int_ctl &= mask;
 +              svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
  
                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));
 -              svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
 +
 +              svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
 +                      V_IRQ_INJECTION_BITS_MASK;
        }
  
        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
@@@ -2078,11 -2042,15 +2043,15 @@@ static int shutdown_interception(struc
                return -EINVAL;
  
        /*
-        * VMCB is undefined after a SHUTDOWN intercept
-        * so reinitialize it.
+        * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
+        * the VMCB in a known good state.  Unfortunately, KVM doesn't have
+        * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+        * userspace.  At a platform view, INIT is acceptable behavior as
+        * there exist bare metal platforms that automatically INIT the CPU
+        * in response to shutdown.
         */
        clear_page(svm->vmcb);
-       init_vmcb(vcpu);
+       kvm_vcpu_reset(vcpu, true);
  
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
@@@ -2993,10 -2961,6 +2962,6 @@@ static int svm_set_msr(struct kvm_vcpu 
                svm->msr_decfg = data;
                break;
        }
-       case MSR_IA32_APICBASE:
-               if (kvm_vcpu_apicv_active(vcpu))
-                       avic_update_vapic_bar(to_svm(vcpu), data);
-               fallthrough;
        default:
                return kvm_set_msr_common(vcpu, msr);
        }
@@@ -3021,7 -2985,7 +2986,7 @@@ static int interrupt_window_interceptio
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(vcpu, true);
+       kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
  
        ++vcpu->stat.irq_window_exits;
        return 1;
@@@ -3269,12 -3233,14 +3234,14 @@@ static void dump_vmcb(struct kvm_vcpu *
               "excp_to:", save->last_excp_to);
  }
  
- static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+ static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
  {
-       if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
-           svm_exit_handlers[exit_code])
-               return 0;
+       return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+               svm_exit_handlers[exit_code]);
+ }
  
+ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+ {
        vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
        dump_vmcb(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.ndata = 2;
        vcpu->run->internal.data[0] = exit_code;
        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
-       return -EINVAL;
+       return 0;
  }
  
  int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
  {
-       if (svm_handle_invalid_exit(vcpu, exit_code))
-               return 0;
+       if (!svm_check_exit_valid(vcpu, exit_code))
+               return svm_handle_invalid_exit(vcpu, exit_code);
  
  #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
@@@ -3573,7 -3538,7 +3539,7 @@@ static void svm_enable_irq_window(struc
                 * via AVIC. In such case, we need to temporarily disable AVIC,
                 * and fallback to injecting IRQ via V_IRQ.
                 */
-               svm_toggle_avic_for_irq_window(vcpu, false);
+               kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
                svm_set_vintr(svm);
        }
  }
@@@ -3808,6 -3773,8 +3774,8 @@@ static __no_kcsan fastpath_t svm_vcpu_r
  
        pre_svm_run(vcpu);
  
+       WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
        sync_lapic_to_cr8(vcpu);
  
        if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@@ -4610,7 -4577,6 +4578,6 @@@ static struct kvm_x86_ops svm_x86_ops _
        .set_virtual_apic_mode = svm_set_virtual_apic_mode,
        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
        .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
-       .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
        .load_eoi_exitmap = svm_load_eoi_exitmap,
        .hwapic_irr_update = svm_hwapic_irr_update,
        .hwapic_isr_update = svm_hwapic_isr_update,
@@@ -330,31 -330,6 +330,31 @@@ void nested_vmx_free_vcpu(struct kvm_vc
        vcpu_put(vcpu);
  }
  
 +#define EPTP_PA_MASK   GENMASK_ULL(51, 12)
 +
 +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
 +{
 +      return VALID_PAGE(root_hpa) &&
 +             ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
 +}
 +
 +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
 +                                     gpa_t addr)
 +{
 +      uint i;
 +      struct kvm_mmu_root_info *cached_root;
 +
 +      WARN_ON_ONCE(!mmu_is_nested(vcpu));
 +
 +      for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 +              cached_root = &vcpu->arch.mmu->prev_roots[i];
 +
 +              if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
 +                                          eptp))
 +                      vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
 +      }
 +}
 +
  static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
  {
                vm_exit_reason = EXIT_REASON_PML_FULL;
                vmx->nested.pml_full = false;
                exit_qualification &= INTR_INFO_UNBLOCK_NMI;
 -      } else if (fault->error_code & PFERR_RSVD_MASK)
 -              vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
 -      else
 -              vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
 +      } else {
 +              if (fault->error_code & PFERR_RSVD_MASK)
 +                      vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
 +              else
 +                      vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
 +
 +              /*
 +               * Although the caller (kvm_inject_emulated_page_fault) would
 +               * have already synced the faulting address in the shadow EPT
 +               * tables for the current EPTP12, we also need to sync it for
 +               * any other cached EPTP02s based on the same EP4TA, since the
 +               * TLB associates mappings to the EP4TA rather than the full EPTP.
 +               */
 +              nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
 +                                         fault->address);
 +      }
  
        nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
        vmcs12->guest_physical_address = fault->address;
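
  [Editor's note] The comment in the hunk above is the key point: the TLB tags guest-physical translations with the EP4TA (the page-frame bits of the EPTP), so after an emulated page fault the address must be flushed from every cached shadow root whose EPTP shares those bits, not only the current one. A standalone sketch of the mask-and-compare step, using the same bits-51:12 mask as EPTP_PA_MASK; the root cache and surrounding loop are invented for illustration.

	/* Sketch of EP4TA matching over a small cache of roots; illustrative only. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define EPTP_PA_MASK	(((1ULL << 52) - 1) & ~((1ULL << 12) - 1)) /* bits 51:12 */
	#define NUM_PREV_ROOTS	3
	#define INVALID_ROOT	(~0ULL)

	struct cached_root {
		uint64_t hpa;	/* host-physical root page, INVALID_ROOT if unused */
		uint64_t eptp;	/* EPTP the shadow root was built for */
	};

	static bool root_matches(const struct cached_root *r, uint64_t eptp)
	{
		return r->hpa != INVALID_ROOT &&
		       (r->eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK);
	}

	static void invalidate_addr(struct cached_root roots[NUM_PREV_ROOTS],
				    uint64_t eptp, uint64_t addr)
	{
		for (int i = 0; i < NUM_PREV_ROOTS; i++)
			if (root_matches(&roots[i], eptp))
				printf("flush 0x%llx in root 0x%llx\n",
				       (unsigned long long)addr,
				       (unsigned long long)roots[i].hpa);
	}
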
@@@ -2207,7 -2170,8 +2207,8 @@@ static void prepare_vmcs02_early_rare(s
        }
  }
  
- static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+                                struct vmcs12 *vmcs12)
  {
        u32 exec_control;
        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
        /*
         * PIN CONTROLS
         */
-       exec_control = vmx_pin_based_exec_ctrl(vmx);
+       exec_control = __pin_controls_get(vmcs01);
        exec_control |= (vmcs12->pin_based_vm_exec_control &
                         ~PIN_BASED_VMX_PREEMPTION_TIMER);
  
        /* Posted interrupts setting is only taken from vmcs12.  */
-       if (nested_cpu_has_posted_intr(vmcs12)) {
+       vmx->nested.pi_pending = false;
+       if (nested_cpu_has_posted_intr(vmcs12))
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
-               vmx->nested.pi_pending = false;
-       } else {
+       else
                exec_control &= ~PIN_BASED_POSTED_INTR;
-       }
        pin_controls_set(vmx, exec_control);
  
        /*
         * EXEC CONTROLS
         */
-       exec_control = vmx_exec_control(vmx); /* L0's desires */
+       exec_control = __exec_controls_get(vmcs01); /* L0's desires */
        exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
        exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
        exec_control &= ~CPU_BASED_TPR_SHADOW;
         * SECONDARY EXEC CONTROLS
         */
        if (cpu_has_secondary_exec_ctrls()) {
-               exec_control = vmx->secondary_exec_control;
+               exec_control = __secondary_exec_controls_get(vmcs01);
  
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                                  SECONDARY_EXEC_ENABLE_INVPCID |
                                  SECONDARY_EXEC_ENABLE_RDTSCP |
                                  SECONDARY_EXEC_XSAVES |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_ENABLE_VMFUNC |
-                                 SECONDARY_EXEC_TSC_SCALING);
+                                 SECONDARY_EXEC_TSC_SCALING |
+                                 SECONDARY_EXEC_DESC);
                if (nested_cpu_has(vmcs12,
                                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
         * on the related bits (if supported by the CPU) in the hope that
         * we can avoid VMWrites during vmx_set_efer().
         */
-       exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
-                       ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+       exec_control = __vm_entry_controls_get(vmcs01);
+       exec_control |= vmcs12->vm_entry_controls;
+       exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
        if (cpu_has_load_ia32_efer()) {
                if (guest_efer & EFER_LMA)
                        exec_control |= VM_ENTRY_IA32E_MODE;
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits may be modified by vmx_set_efer() in prepare_vmcs02().
         */
-       exec_control = vmx_vmexit_ctrl();
+       exec_control = __vm_exit_controls_get(vmcs01);
        if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
                exec_control |= VM_EXIT_LOAD_IA32_EFER;
+       else
+               exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
        vm_exit_controls_set(vmx, exec_control);
  
        /*
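
  [Editor's note] The recurring pattern in the prepare_vmcs02_early() hunk is: start from the value L0 already programmed into vmcs01 (__pin_controls_get() and friends read the cached shadow copy, so no VMREAD is needed), OR in what L1 requested in vmcs12, then explicitly clear or re-derive the bits that must come from exactly one side. A small self-contained sketch of that merge, with invented control-bit names:

	/* Sketch of merging L0 (vmcs01) and L1 (vmcs12) control bits; names invented. */
	#include <stdint.h>

	#define CTRL_PREEMPT_TIMER	(1u << 0)	/* never taken from L1 here */
	#define CTRL_LOAD_EFER		(1u << 2)	/* decided by L0 at the end */

	static uint32_t merge_controls(uint32_t vmcs01_ctrl, uint32_t vmcs12_ctrl,
				       int want_load_efer)
	{
		uint32_t ctrl = vmcs01_ctrl;			/* L0's baseline */

		ctrl |= vmcs12_ctrl & ~CTRL_PREEMPT_TIMER;	/* add L1's requests */
		ctrl &= ~CTRL_LOAD_EFER;			/* force-clear, then... */
		if (want_load_efer)
			ctrl |= CTRL_LOAD_EFER;			/* ...set only if needed */
		return ctrl;
	}

  Taking the baseline from the vmcs01 shadow rather than recomputing L0's desires is also what lets the *_controls_set() helpers skip the VMWRITE when the merged value ends up unchanged.
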
@@@ -3384,7 -3353,7 +3390,7 @@@ enum nvmx_vmentry_status nested_vmx_ent
  
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
  
-       prepare_vmcs02_early(vmx, vmcs12);
+       prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
  
        if (from_vmentry) {
                if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
@@@ -4304,7 -4273,7 +4310,7 @@@ static void load_vmcs12_host_state(stru
                seg.l = 1;
        else
                seg.db = 1;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
        seg = (struct kvm_segment) {
                .base = 0,
                .limit = 0xFFFFFFFF,
                .g = 1
        };
        seg.selector = vmcs12->host_ds_selector;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
        seg.selector = vmcs12->host_es_selector;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
        seg.selector = vmcs12->host_ss_selector;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
        seg.selector = vmcs12->host_fs_selector;
        seg.base = vmcs12->host_fs_base;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
        seg.selector = vmcs12->host_gs_selector;
        seg.base = vmcs12->host_gs_base;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
        seg = (struct kvm_segment) {
                .base = vmcs12->host_tr_base,
                .limit = 0x67,
                .type = 11,
                .present = 1
        };
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+       memset(&seg, 0, sizeof(seg));
+       seg.unusable = 1;
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
  
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
  
-       if (cpu_has_vmx_msr_bitmap())
-               vmx_update_msr_bitmap(vcpu);
        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
                                vmcs12->vm_exit_msr_load_count))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
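
  [Editor's note] load_vmcs12_host_state() rebuilds the L1 "host" segment registers by filling a struct kvm_segment per register and handing it to the internal __vmx_set_segment() helper; the new tail also marks LDTR unusable, matching the architectural VM-exit state. A userspace-flavoured sketch of filling the same uapi structure for a flat data segment and an unusable segment; the values mirror the reload above but are only illustrative.

	/* Sketch: describing segments with the uapi struct kvm_segment. */
	#include <linux/kvm.h>
	#include <string.h>

	static struct kvm_segment flat_data_segment(__u16 selector)
	{
		struct kvm_segment seg;

		memset(&seg, 0, sizeof(seg));
		seg.selector = selector;
		seg.base     = 0;
		seg.limit    = 0xFFFFFFFF;
		seg.type     = 3;	/* read/write data, accessed */
		seg.s        = 1;	/* code/data, not system */
		seg.present  = 1;
		seg.db       = 1;	/* 32-bit default operand size */
		seg.g        = 1;	/* 4 KiB granularity */
		return seg;
	}

	static struct kvm_segment unusable_segment(void)
	{
		struct kvm_segment seg;

		memset(&seg, 0, sizeof(seg));
		seg.unusable = 1;	/* e.g. LDTR after VM-exit */
		return seg;
	}
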
@@@ -4419,9 -4389,6 +4426,6 @@@ static void nested_vmx_restore_host_sta
  
        kvm_mmu_reset_context(vcpu);
  
-       if (cpu_has_vmx_msr_bitmap())
-               vmx_update_msr_bitmap(vcpu);
        /*
         * This nasty bit of open coding is a compromise between blindly
         * loading L1's MSRs using the exit load lists (incorrect emulation
@@@ -5362,6 -5329,14 +5366,6 @@@ static int handle_vmptrst(struct kvm_vc
        return nested_vmx_succeed(vcpu);
  }
  
 -#define EPTP_PA_MASK   GENMASK_ULL(51, 12)
 -
 -static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
 -{
 -      return VALID_PAGE(root_hpa) &&
 -              ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
 -}
 -
  /* Emulate the INVEPT instruction */
  static int handle_invept(struct kvm_vcpu *vcpu)
  {
@@@ -5855,8 -5830,7 +5859,8 @@@ static bool nested_vmx_l0_wants_exit(st
                if (is_nmi(intr_info))
                        return true;
                else if (is_page_fault(intr_info))
 -                      return vcpu->arch.apf.host_apf_flags || !enable_ept;
 +                      return vcpu->arch.apf.host_apf_flags ||
 +                             vmx_need_pf_intercept(vcpu);
                else if (is_debug(intr_info) &&
                         vcpu->guest_debug &
                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
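
  [Editor's note] The page-fault change in the last hunk replaces the bare !enable_ept test with vmx_need_pf_intercept(), so L0 claims a reflected #PF whenever it is still intercepting page faults for its own reasons (a pending async page fault for the host, or configurations where EPT alone is not sufficient), not only when EPT is disabled. A hedged sketch of that kind of decision helper, with invented predicate and field names:

	/* Sketch of an "L0 wants this #PF" decision; fields are illustrative. */
	#include <stdbool.h>

	struct vcpu_flags {
		bool host_async_pf_pending;	/* async #PF token owed to L0 */
		bool ept_enabled;
		bool smaller_maxphyaddr;	/* emulating a reduced MAXPHYADDR */
	};

	static bool need_pf_intercept(const struct vcpu_flags *f)
	{
		/* L0 traps #PF when it cannot rely on EPT alone. */
		return !f->ept_enabled || f->smaller_maxphyaddr;
	}

	static bool l0_wants_page_fault(const struct vcpu_flags *f)
	{
		return f->host_async_pf_pending || need_pf_intercept(f);
	}
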