Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Sep 2021 20:40:51 +0000 (13:40 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Sep 2021 20:40:51 +0000 (13:40 -0700)
Pull KVM updates from Paolo Bonzini:
 "ARM:
   - Page ownership tracking between host EL1 and EL2
   - Rely on userspace page tables to create large stage-2 mappings
   - Fix incompatibility between pKVM and kmemleak
   - Fix the PMU reset state, and improve the performance of the virtual
     PMU
   - Move over to the generic KVM entry code
   - Address PSCI reset issues w.r.t. save/restore
   - Preliminary rework for the upcoming pKVM fixed feature
   - A bunch of MM cleanups
   - A vGIC fix for spurious timer interrupts
   - Various cleanups

  s390:
   - enable interpretation of specification exceptions
   - fix a vcpu_idx vs vcpu_id mixup

  x86:
   - fast (lockless) page fault support for the new MMU
   - new MMU now the default
   - increased maximum allowed VCPU count
   - allow inhibiting IRQs on KVM_RUN while debugging guests
   - let Hyper-V-enabled guests run with virtualized LAPIC as long as
     they do not enable the Hyper-V "AutoEOI" feature
   - fixes and optimizations for the toggling of AMD AVIC (virtualized
     LAPIC)
   - tuning for the case when two-dimensional paging (EPT/NPT) is
     disabled
   - bugfixes and cleanups, especially with respect to vCPU reset and
     choosing a paging mode based on CR0/CR4/EFER
   - support for 5-level page table on AMD processors

  Generic:
   - MMU notifier invalidation callbacks do not take mmu_lock unless
     necessary
   - improved caching of LRU kvm_memory_slot
   - support for histogram statistics
   - add statistics for halt polling and remote TLB flush requests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (210 commits)
  KVM: Drop unused kvm_dirty_gfn_invalid()
  KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
  KVM: MMU: mark role_regs and role accessors as maybe unused
  KVM: MIPS: Remove a "set but not used" variable
  x86/kvm: Don't enable IRQ when IRQ enabled in kvm_wait
  KVM: stats: Add VM stat for remote tlb flush requests
  KVM: Remove unnecessary export of kvm_{inc,dec}_notifier_count()
  KVM: x86/mmu: Move lpage_disallowed_link further "down" in kvm_mmu_page
  KVM: x86/mmu: Relocate kvm_mmu_page.tdp_mmu_page for better cache locality
  Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
  KVM: x86/mmu: Remove unused field mmio_cached in struct kvm_mmu_page
  kvm: x86: Increase KVM_SOFT_MAX_VCPUS to 710
  kvm: x86: Increase MAX_VCPUS to 1024
  kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS
  KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
  KVM: x86/mmu: Don't freak out if pml5_root is NULL on 4-level host
  KVM: s390: index kvm->arch.idle_mask by vcpu_idx
  KVM: s390: Enable specification exception interpretation
  KVM: arm64: Trim guest debug exception handling
  KVM: SVM: Add 5-level page table support for SVM
  ...

120 files changed:
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/locking.rst
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/kvm_pgtable.h
arch/arm64/include/asm/sysreg.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/vmlinux.lds.S
arch/arm64/kvm/Kconfig
arch/arm64/kvm/arm.c
arch/arm64/kvm/debug.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
arch/arm64/kvm/hyp/include/nvhe/mm.h
arch/arm64/kvm/hyp/include/nvhe/spinlock.h
arch/arm64/kvm/hyp/nvhe/debug-sr.c
arch/arm64/kvm/hyp/nvhe/host.S
arch/arm64/kvm/hyp/nvhe/hyp-main.c
arch/arm64/kvm/hyp/nvhe/mem_protect.c
arch/arm64/kvm/hyp/nvhe/mm.c
arch/arm64/kvm/hyp/nvhe/setup.c
arch/arm64/kvm/hyp/nvhe/switch.c
arch/arm64/kvm/hyp/nvhe/tlb.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/hyp/vhe/debug-sr.c
arch/arm64/kvm/hyp/vhe/switch.c
arch/arm64/kvm/hyp/vhe/sysreg-sr.c
arch/arm64/kvm/hyp/vhe/tlb.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/perf.c
arch/arm64/kvm/pmu-emul.c
arch/arm64/kvm/psci.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/arm64/kvm/trace_handle_exit.h
arch/arm64/kvm/vgic/vgic-mmio-v2.c
arch/arm64/kvm/vgic/vgic-v2.c
arch/arm64/kvm/vgic/vgic-v3.c
arch/arm64/kvm/vgic/vgic.c
arch/arm64/kvm/vgic/vgic.h
arch/mips/kvm/mips.c
arch/mips/kvm/vz.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/booke.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/vsie.c
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kernel/kvm.c
arch/x86/kvm/debugfs.c
arch/x86/kvm/hyperv.c
arch/x86/kvm/i8254.c
arch/x86/kvm/ioapic.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_audit.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/mmutrace.h
arch/x86/kvm/mmu/page_track.c
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/pmu.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/svm_ops.h
arch/x86/kvm/vmx/evmcs.c
arch/x86/kvm/vmx/evmcs.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/vmcs.h
arch/x86/kvm/vmx/vmcs12.c
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/vmx/vmx_ops.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/kvm/xen.c
arch/x86/kvm/xen.h
include/linux/entry-kvm.h
include/linux/kvm_host.h
include/linux/kvm_types.h
include/linux/page-flags.h
include/uapi/linux/kvm.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/access_tracking_perf_test.c
tools/testing/selftests/kvm/demand_paging_test.c
tools/testing/selftests/kvm/dirty_log_perf_test.c
tools/testing/selftests/kvm/include/aarch64/processor.h
tools/testing/selftests/kvm/include/perf_test_util.h
tools/testing/selftests/kvm/kvm_binary_stats_test.c
tools/testing/selftests/kvm/lib/perf_test_util.c
tools/testing/selftests/kvm/memslot_modification_stress_test.c
tools/testing/selftests/kvm/x86_64/debug_regs.c
virt/kvm/binary_stats.c
virt/kvm/dirty_ring.c
virt/kvm/kvm_main.c

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index c6212c2..a6729c8 100644
@@ -3357,6 +3357,7 @@ flags which can include the following:
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
+  - KVM_GUESTDBG_BLOCKIRQ:      avoid injecting interrupts/NMI/SMI [x86]
 
 For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
 are enabled in memory so we need to ensure breakpoint exceptions are
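
The KVM_GUESTDBG_* flags documented above are passed in the ``control`` field
of struct kvm_guest_debug via the KVM_SET_GUEST_DEBUG vcpu ioctl. A minimal
userspace sketch of the new flag follows (illustrative only: the vcpu_fd
parameter and the function name are placeholders, and error handling is
omitted):

    #include <linux/kvm.h>      /* struct kvm_guest_debug, KVM_GUESTDBG_* */
    #include <string.h>
    #include <sys/ioctl.h>

    /* Single-step a vCPU while blocking IRQ/NMI/SMI injection (x86 only). */
    static int debug_without_interrupts(int vcpu_fd)
    {
            struct kvm_guest_debug dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.control = KVM_GUESTDBG_ENABLE |
                          KVM_GUESTDBG_SINGLESTEP |
                          KVM_GUESTDBG_BLOCKIRQ;

            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }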
@@ -5208,6 +5209,9 @@ by a string of size ``name_size``.
        #define KVM_STATS_TYPE_CUMULATIVE       (0x0 << KVM_STATS_TYPE_SHIFT)
        #define KVM_STATS_TYPE_INSTANT          (0x1 << KVM_STATS_TYPE_SHIFT)
        #define KVM_STATS_TYPE_PEAK             (0x2 << KVM_STATS_TYPE_SHIFT)
+       #define KVM_STATS_TYPE_LINEAR_HIST      (0x3 << KVM_STATS_TYPE_SHIFT)
+       #define KVM_STATS_TYPE_LOG_HIST         (0x4 << KVM_STATS_TYPE_SHIFT)
+       #define KVM_STATS_TYPE_MAX              KVM_STATS_TYPE_LOG_HIST
 
        #define KVM_STATS_UNIT_SHIFT            4
        #define KVM_STATS_UNIT_MASK             (0xF << KVM_STATS_UNIT_SHIFT)
@@ -5215,18 +5219,20 @@ by a string of size ``name_size``.
        #define KVM_STATS_UNIT_BYTES            (0x1 << KVM_STATS_UNIT_SHIFT)
        #define KVM_STATS_UNIT_SECONDS          (0x2 << KVM_STATS_UNIT_SHIFT)
        #define KVM_STATS_UNIT_CYCLES           (0x3 << KVM_STATS_UNIT_SHIFT)
+       #define KVM_STATS_UNIT_MAX              KVM_STATS_UNIT_CYCLES
 
        #define KVM_STATS_BASE_SHIFT            8
        #define KVM_STATS_BASE_MASK             (0xF << KVM_STATS_BASE_SHIFT)
        #define KVM_STATS_BASE_POW10            (0x0 << KVM_STATS_BASE_SHIFT)
        #define KVM_STATS_BASE_POW2             (0x1 << KVM_STATS_BASE_SHIFT)
+       #define KVM_STATS_BASE_MAX              KVM_STATS_BASE_POW2
 
        struct kvm_stats_desc {
                __u32 flags;
                __s16 exponent;
                __u16 size;
                __u32 offset;
-               __u32 unused;
+               __u32 bucket_size;
                char name[];
        };
 
@@ -5237,21 +5243,35 @@ The following flags are supported:
 Bits 0-3 of ``flags`` encode the type:
 
   * ``KVM_STATS_TYPE_CUMULATIVE``
-    The statistics data is cumulative. The value of data can only be increased.
+    The statistics reports a cumulative count. The value of data can only be increased.
     Most of the counters used in KVM are of this type.
     The corresponding ``size`` field for this type is always 1.
     All cumulative statistics data are read/write.
   * ``KVM_STATS_TYPE_INSTANT``
-    The statistics data is instantaneous. Its value can be increased or
+    The statistics reports an instantaneous value. Its value can be increased or
     decreased. This type is usually used as a measurement of some resources,
     like the number of dirty pages, the number of large pages, etc.
     All instant statistics are read only.
     The corresponding ``size`` field for this type is always 1.
   * ``KVM_STATS_TYPE_PEAK``
-    The statistics data is peak. The value of data can only be increased, and
-    represents a peak value for a measurement, for example the maximum number
+    The statistics data reports a peak value, for example the maximum number
     of items in a hash table bucket, the longest time waited and so on.
+    The value of data can only be increased.
     The corresponding ``size`` field for this type is always 1.
+  * ``KVM_STATS_TYPE_LINEAR_HIST``
+    The statistic is reported as a linear histogram. The number of
+    buckets is specified by the ``size`` field. The size of buckets is specified
+    by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``)
+    is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last
+    bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity
+    value.) The bucket value indicates how many samples fell in the bucket's range.
+  * ``KVM_STATS_TYPE_LOG_HIST``
+    The statistic is reported as a logarithmic histogram. The number of
+    buckets is specified by the ``size`` field. The range of the first bucket is
+    [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF).
+    Otherwise, The Nth bucket (1 < N < ``size``) covers
+    [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell
+    in the bucket's range.
 
 Bits 4-7 of ``flags`` encode the unit:
 
@@ -5286,9 +5306,9 @@ unsigned 64bit data.
 The ``offset`` field is the offset from the start of Data Block to the start of
 the corresponding statistics data.
 
-The ``unused`` field is reserved for future support for other types of
-statistics data, like log/linear histogram. Its value is always 0 for the types
-defined above.
+The ``bucket_size`` field is used as a parameter for histogram statistics data.
+It is only used by linear histogram statistics data, specifying the size of a
+bucket.
 
 The ``name`` field is the name string of the statistics data. The name string
 starts at the end of ``struct kvm_stats_desc``.  The maximum length including
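
As a rough sketch of how a consumer of the binary stats interface might map a
sample to a bucket index for the two new histogram types (illustrative only:
the helper names are made up, ``size`` and ``bucket_size`` come from the
descriptor, and ``bucket_size`` is assumed non-zero for linear histograms; the
ranges follow the documentation above):

    #include <stdint.h>

    /* Linear: 0-based bucket i covers [bucket_size*i, bucket_size*(i+1)),
     * with the last bucket open-ended. */
    static uint32_t linear_hist_bucket(uint64_t value, uint32_t bucket_size,
                                       uint32_t size)
    {
            uint64_t i = value / bucket_size;

            return i < size ? (uint32_t)i : size - 1;
    }

    /* Logarithmic: bucket 0 covers [0, 1), bucket i (i >= 1) covers
     * [2^(i-1), 2^i), with the last bucket open-ended. */
    static uint32_t log_hist_bucket(uint64_t value, uint32_t size)
    {
            uint32_t i = 0;

            while (value) {             /* position of highest set bit */
                    value >>= 1;
                    i++;
            }
            return i < size ? i : size - 1;
    }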
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 88fa495..5d27da3 100644
@@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
   can be taken inside a kvm->srcu read-side critical section,
   while kvm->slots_lock cannot.
 
+- kvm->mn_active_invalidate_count ensures that pairs of
+  invalidate_range_start() and invalidate_range_end() callbacks
+  use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
+  are taken on the waiting side in install_new_memslots, so MMU notifiers
+  must not take either kvm->slots_lock or kvm->slots_arch_lock.
+
 On x86:
 
 - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
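
The mn_active_invalidate_count rule added above can be pictured with a small,
self-contained sketch (pthreads, purely illustrative; none of these names are
the kernel's): the notifier pair only touches a dedicated counter lock, while
the memslot installer holds the update lock and waits for in-flight pairs to
drain.

    #include <pthread.h>

    struct slots_sync {
            pthread_mutex_t slots_lock;   /* held across the memslot update */
            pthread_mutex_t count_lock;   /* protects only the counter */
            pthread_cond_t  drained;
            int             active_invalidate_count;
    };

    static void invalidate_range_start(struct slots_sync *s)
    {
            /* Never takes slots_lock, which the installer may hold. */
            pthread_mutex_lock(&s->count_lock);
            s->active_invalidate_count++;
            pthread_mutex_unlock(&s->count_lock);
    }

    static void invalidate_range_end(struct slots_sync *s)
    {
            pthread_mutex_lock(&s->count_lock);
            if (--s->active_invalidate_count == 0)
                    pthread_cond_broadcast(&s->drained);
            pthread_mutex_unlock(&s->count_lock);
    }

    static void install_new_memslots(struct slots_sync *s)
    {
            pthread_mutex_lock(&s->slots_lock);
            pthread_mutex_lock(&s->count_lock);
            while (s->active_invalidate_count)
                    pthread_cond_wait(&s->drained, &s->count_lock);
            /* ... publish the new memslots array here ... */
            pthread_mutex_unlock(&s->count_lock);
            pthread_mutex_unlock(&s->slots_lock);
    }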
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index cdfa2a2..ef6be92 100644
@@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
 {
        u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT);
 
-       return val == ID_AA64PFR0_EL1_32BIT_64BIT;
+       return val == ID_AA64PFR0_ELx_32BIT_64BIT;
 }
 
 static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
 {
        u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT);
 
-       return val == ID_AA64PFR0_EL0_32BIT_64BIT;
+       return val == ID_AA64PFR0_ELx_32BIT_64BIT;
 }
 
 static inline bool id_aa64pfr0_sve(u64 pfr0)
@@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
 static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
 {
        switch (parange) {
-       case 0: return 32;
-       case 1: return 36;
-       case 2: return 40;
-       case 3: return 42;
-       case 4: return 44;
-       case 5: return 48;
-       case 6: return 52;
+       case ID_AA64MMFR0_PARANGE_32: return 32;
+       case ID_AA64MMFR0_PARANGE_36: return 36;
+       case ID_AA64MMFR0_PARANGE_40: return 40;
+       case ID_AA64MMFR0_PARANGE_42: return 42;
+       case ID_AA64MMFR0_PARANGE_44: return 44;
+       case ID_AA64MMFR0_PARANGE_48: return 48;
+       case ID_AA64MMFR0_PARANGE_52: return 52;
        /*
         * A future PE could use a value unknown to the kernel.
         * However, by the "D10.1.4 Principles of the ID scheme
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index d436831..327120c 100644
 #include <asm/types.h>
 
 /* Hyp Configuration Register (HCR) bits */
+
+#define HCR_TID5       (UL(1) << 58)
+#define HCR_DCT                (UL(1) << 57)
 #define HCR_ATA_SHIFT  56
 #define HCR_ATA                (UL(1) << HCR_ATA_SHIFT)
+#define HCR_AMVOFFEN   (UL(1) << 51)
+#define HCR_FIEN       (UL(1) << 47)
 #define HCR_FWB                (UL(1) << 46)
 #define HCR_API                (UL(1) << 41)
 #define HCR_APK                (UL(1) << 40)
@@ -32,9 +37,9 @@
 #define HCR_TVM                (UL(1) << 26)
 #define HCR_TTLB       (UL(1) << 25)
 #define HCR_TPU                (UL(1) << 24)
-#define HCR_TPC                (UL(1) << 23)
+#define HCR_TPC                (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */
 #define HCR_TSW                (UL(1) << 22)
-#define HCR_TAC                (UL(1) << 21)
+#define HCR_TACR       (UL(1) << 21)
 #define HCR_TIDCP      (UL(1) << 20)
 #define HCR_TSC                (UL(1) << 19)
 #define HCR_TID3       (UL(1) << 18)
 #define HCR_PTW                (UL(1) << 2)
 #define HCR_SWIO       (UL(1) << 1)
 #define HCR_VM         (UL(1) << 0)
+#define HCR_RES0       ((UL(1) << 48) | (UL(1) << 39))
 
 /*
  * The bits we set in HCR:
  * TLOR:       Trap LORegion register accesses
  * RW:         64bit by default, can be overridden for 32bit VMs
- * TAC:                Trap ACTLR
+ * TACR:       Trap ACTLR
  * TSC:                Trap SMC
  * TSW:                Trap cache operations by set/way
  * TWE:                Trap WFE
@@ -76,7 +82,7 @@
  * PTW:                Take a stage2 fault if a stage1 walk steps in device memory
  */
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
-                        HCR_BSU_IS | HCR_FB | HCR_TAC | \
+                        HCR_BSU_IS | HCR_FB | HCR_TACR | \
                         HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
                         HCR_FMO | HCR_IMO | HCR_PTW )
 #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
 #define CPTR_EL2_TTA   (1 << 20)
 #define CPTR_EL2_TFP   (1 << CPTR_EL2_TFP_SHIFT)
 #define CPTR_EL2_TZ    (1 << 8)
-#define CPTR_EL2_RES1  0x000032ff /* known RES1 bits in CPTR_EL2 */
-#define CPTR_EL2_DEFAULT       CPTR_EL2_RES1
+#define CPTR_NVHE_EL2_RES1     0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */
+#define CPTR_EL2_DEFAULT       CPTR_NVHE_EL2_RES1
+#define CPTR_NVHE_EL2_RES0     (GENMASK(63, 32) |      \
+                                GENMASK(29, 21) |      \
+                                GENMASK(19, 14) |      \
+                                BIT(11))
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_E2TB_MASK     (UL(0x3))
 #define MDCR_EL2_E2TB_SHIFT    (UL(24))
-#define MDCR_EL2_TTRF          (1 << 19)
-#define MDCR_EL2_TPMS          (1 << 14)
+#define MDCR_EL2_HPMFZS                (UL(1) << 36)
+#define MDCR_EL2_HPMFZO                (UL(1) << 29)
+#define MDCR_EL2_MTPME         (UL(1) << 28)
+#define MDCR_EL2_TDCC          (UL(1) << 27)
+#define MDCR_EL2_HCCD          (UL(1) << 23)
+#define MDCR_EL2_TTRF          (UL(1) << 19)
+#define MDCR_EL2_HPMD          (UL(1) << 17)
+#define MDCR_EL2_TPMS          (UL(1) << 14)
 #define MDCR_EL2_E2PB_MASK     (UL(0x3))
 #define MDCR_EL2_E2PB_SHIFT    (UL(12))
-#define MDCR_EL2_TDRA          (1 << 11)
-#define MDCR_EL2_TDOSA         (1 << 10)
-#define MDCR_EL2_TDA           (1 << 9)
-#define MDCR_EL2_TDE           (1 << 8)
-#define MDCR_EL2_HPME          (1 << 7)
-#define MDCR_EL2_TPM           (1 << 6)
-#define MDCR_EL2_TPMCR         (1 << 5)
-#define MDCR_EL2_HPMN_MASK     (0x1F)
+#define MDCR_EL2_TDRA          (UL(1) << 11)
+#define MDCR_EL2_TDOSA         (UL(1) << 10)
+#define MDCR_EL2_TDA           (UL(1) << 9)
+#define MDCR_EL2_TDE           (UL(1) << 8)
+#define MDCR_EL2_HPME          (UL(1) << 7)
+#define MDCR_EL2_TPM           (UL(1) << 6)
+#define MDCR_EL2_TPMCR         (UL(1) << 5)
+#define MDCR_EL2_HPMN_MASK     (UL(0x1F))
+#define MDCR_EL2_RES0          (GENMASK(63, 37) |      \
+                                GENMASK(35, 30) |      \
+                                GENMASK(25, 24) |      \
+                                GENMASK(22, 20) |      \
+                                BIT(18) |              \
+                                GENMASK(16, 15))
 
 /* For compatibility with fault code shared with 32-bit */
 #define FSC_FAULT      ESR_ELx_FSC_FAULT
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 9f0bf21..e86045a 100644
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs              13
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs           14
 #define __KVM_HOST_SMCCC_FUNC___pkvm_init                      15
-#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings           16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp            16
 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping    17
 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector            18
 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize             19
-#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp                  20
-#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc                  21
+#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc                  20
 
 #ifndef __ASSEMBLY__
 
@@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void);
 extern void __vgic_v3_write_vmcr(u32 vmcr);
 extern void __vgic_v3_init_lrs(void);
 
-extern u32 __kvm_get_mdcr_el2(void);
+extern u64 __kvm_get_mdcr_el2(void);
 
 #define __KVM_EXTABLE(from, to)                                                \
        "       .pushsection    __kvm_ex_table, \"a\"\n"                \
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 4191158..f8be56d 100644
@@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 extern unsigned int kvm_sve_max_vl;
 int kvm_arm_init_sve(void);
 
-int __attribute_const__ kvm_target_cpu(void);
+u32 __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
 
@@ -185,7 +185,6 @@ enum vcpu_sysreg {
        PMCNTENSET_EL0, /* Count Enable Set Register */
        PMINTENSET_EL1, /* Interrupt Enable Set Register */
        PMOVSSET_EL0,   /* Overflow Flag Status Set Register */
-       PMSWINC_EL0,    /* Software Increment Register */
        PMUSERENR_EL0,  /* User Enable Register */
 
        /* Pointer Authentication Registers in a strict increasing order. */
@@ -287,9 +286,13 @@ struct kvm_vcpu_arch {
        /* Stage 2 paging state used by the hardware on next switch */
        struct kvm_s2_mmu *hw_mmu;
 
-       /* HYP configuration */
+       /* Values of trap registers for the guest. */
        u64 hcr_el2;
-       u32 mdcr_el2;
+       u64 mdcr_el2;
+       u64 cptr_el2;
+
+       /* Values of trap registers for the host before guest entry. */
+       u64 mdcr_el2_host;
 
        /* Exception Information */
        struct kvm_vcpu_fault_info fault;
@@ -576,6 +579,7 @@ struct kvm_vcpu_stat {
        u64 wfi_exit_stat;
        u64 mmio_exit_user;
        u64 mmio_exit_kernel;
+       u64 signal_exits;
        u64 exits;
 };
 
@@ -771,6 +775,11 @@ void kvm_arch_free_vm(struct kvm *kvm);
 
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
 
+static inline bool kvm_vm_is_protected(struct kvm *kvm)
+{
+       return false;
+}
+
 int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
 bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 9d60b30..657d0c9 100644
@@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr);
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
-void deactivate_traps_vhe_put(void);
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu);
 #endif
 
 u64 __guest_enter(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b52c5c4..02d3788 100644
@@ -252,6 +252,11 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
 
 #define kvm_phys_to_vttbr(addr)                phys_to_ttbr(addr)
 
+/*
+ * When this is (directly or indirectly) used on the TLB invalidation
+ * path, we rely on a previously issued DSB so that page table updates
+ * and VMID reads are correctly ordered.
+ */
 static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
 {
        struct kvm_vmid *vmid = &mmu->vmid;
@@ -259,7 +264,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
        u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;
 
        baddr = mmu->pgd_phys;
-       vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT;
+       vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT;
        return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
 }
 
@@ -267,9 +272,10 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
  * Must be called from hyp code running at EL2 with an updated VTTBR
  * and interrupts disabled.
  */
-static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
+static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
+                                         struct kvm_arch *arch)
 {
-       write_sysreg(vtcr, vtcr_el2);
+       write_sysreg(arch->vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
 
        /*
@@ -280,11 +286,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long
        asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
 }
 
-static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
-{
-       __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
-}
-
 static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
 {
        return container_of(mmu->arch, struct kvm, arch);
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index f004c01..0277838 100644
@@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0)
 
 typedef u64 kvm_pte_t;
 
+#define KVM_PTE_VALID                  BIT(0)
+
+#define KVM_PTE_ADDR_MASK              GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48             GENMASK(15, 12)
+
+static inline bool kvm_pte_valid(kvm_pte_t pte)
+{
+       return pte & KVM_PTE_VALID;
+}
+
+static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+       u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+       if (PAGE_SHIFT == 16)
+               pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+       return pa;
+}
+
+static inline u64 kvm_granule_shift(u32 level)
+{
+       /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+       return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static inline u64 kvm_granule_size(u32 level)
+{
+       return BIT(kvm_granule_shift(level));
+}
+
+static inline bool kvm_level_supports_block_mapping(u32 level)
+{
+       /*
+        * Reject invalid block mappings and don't bother with 4TB mappings for
+        * 52-bit PAs.
+        */
+       return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
+}
+
 /**
  * struct kvm_pgtable_mm_ops - Memory management callbacks.
  * @zalloc_page:               Allocate a single zeroed memory page.
@@ -75,31 +115,16 @@ enum kvm_pgtable_stage2_flags {
        KVM_PGTABLE_S2_IDMAP                    = BIT(1),
 };
 
-/**
- * struct kvm_pgtable - KVM page-table.
- * @ia_bits:           Maximum input address size, in bits.
- * @start_level:       Level at which the page-table walk starts.
- * @pgd:               Pointer to the first top-level entry of the page-table.
- * @mm_ops:            Memory management callbacks.
- * @mmu:               Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
- */
-struct kvm_pgtable {
-       u32                                     ia_bits;
-       u32                                     start_level;
-       kvm_pte_t                               *pgd;
-       struct kvm_pgtable_mm_ops               *mm_ops;
-
-       /* Stage-2 only */
-       struct kvm_s2_mmu                       *mmu;
-       enum kvm_pgtable_stage2_flags           flags;
-};
-
 /**
  * enum kvm_pgtable_prot - Page-table permissions and attributes.
  * @KVM_PGTABLE_PROT_X:                Execute permission.
  * @KVM_PGTABLE_PROT_W:                Write permission.
  * @KVM_PGTABLE_PROT_R:                Read permission.
  * @KVM_PGTABLE_PROT_DEVICE:   Device attributes.
+ * @KVM_PGTABLE_PROT_SW0:      Software bit 0.
+ * @KVM_PGTABLE_PROT_SW1:      Software bit 1.
+ * @KVM_PGTABLE_PROT_SW2:      Software bit 2.
+ * @KVM_PGTABLE_PROT_SW3:      Software bit 3.
  */
 enum kvm_pgtable_prot {
        KVM_PGTABLE_PROT_X                      = BIT(0),
@@ -107,21 +132,48 @@ enum kvm_pgtable_prot {
        KVM_PGTABLE_PROT_R                      = BIT(2),
 
        KVM_PGTABLE_PROT_DEVICE                 = BIT(3),
+
+       KVM_PGTABLE_PROT_SW0                    = BIT(55),
+       KVM_PGTABLE_PROT_SW1                    = BIT(56),
+       KVM_PGTABLE_PROT_SW2                    = BIT(57),
+       KVM_PGTABLE_PROT_SW3                    = BIT(58),
 };
 
-#define PAGE_HYP               (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define KVM_PGTABLE_PROT_RW    (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define KVM_PGTABLE_PROT_RWX   (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
+
+#define PKVM_HOST_MEM_PROT     KVM_PGTABLE_PROT_RWX
+#define PKVM_HOST_MMIO_PROT    KVM_PGTABLE_PROT_RW
+
+#define PAGE_HYP               KVM_PGTABLE_PROT_RW
 #define PAGE_HYP_EXEC          (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
 #define PAGE_HYP_RO            (KVM_PGTABLE_PROT_R)
 #define PAGE_HYP_DEVICE                (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
 
+typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
+                                          enum kvm_pgtable_prot prot);
+
 /**
- * struct kvm_mem_range - Range of Intermediate Physical Addresses
- * @start:     Start of the range.
- * @end:       End of the range.
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits:           Maximum input address size, in bits.
+ * @start_level:       Level at which the page-table walk starts.
+ * @pgd:               Pointer to the first top-level entry of the page-table.
+ * @mm_ops:            Memory management callbacks.
+ * @mmu:               Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ * @flags:             Stage-2 page-table flags.
+ * @force_pte_cb:      Function that returns true if page level mappings must
+ *                     be used instead of block mappings.
  */
-struct kvm_mem_range {
-       u64 start;
-       u64 end;
+struct kvm_pgtable {
+       u32                                     ia_bits;
+       u32                                     start_level;
+       kvm_pte_t                               *pgd;
+       struct kvm_pgtable_mm_ops               *mm_ops;
+
+       /* Stage-2 only */
+       struct kvm_s2_mmu                       *mmu;
+       enum kvm_pgtable_stage2_flags           flags;
+       kvm_pgtable_force_pte_cb_t              force_pte_cb;
 };
 
 /**
@@ -216,21 +268,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
 
 /**
- * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
+ * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
  * @arch:      Arch-specific KVM structure representing the guest virtual
  *             machine.
  * @mm_ops:    Memory management callbacks.
  * @flags:     Stage-2 configuration flags.
+ * @force_pte_cb: Function that returns true if page level mappings must
+ *             be used instead of block mappings.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
-                                 struct kvm_pgtable_mm_ops *mm_ops,
-                                 enum kvm_pgtable_stage2_flags flags);
+int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                             struct kvm_pgtable_mm_ops *mm_ops,
+                             enum kvm_pgtable_stage2_flags flags,
+                             kvm_pgtable_force_pte_cb_t force_pte_cb);
 
 #define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
-       kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
+       __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL)
 
 /**
  * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
@@ -374,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
  * If there is a valid, leaf page-table entry used to translate @addr, then
  * relax the permissions in that entry according to the read, write and
  * execute permissions specified by @prot. No permissions are removed, and
- * TLB invalidation is performed after updating the entry.
+ * TLB invalidation is performed after updating the entry. Software bits cannot
+ * be set or cleared using kvm_pgtable_stage2_relax_perms().
  *
  * Return: 0 on success, negative error code on failure.
  */
@@ -433,22 +489,42 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
                     struct kvm_pgtable_walker *walker);
 
 /**
- * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
- *                                  Addresses with compatible permission
- *                                  attributes.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr:      Address that must be covered by the range.
- * @prot:      Protection attributes that the range must be compatible with.
- * @range:     Range structure used to limit the search space at call time and
- *             that will hold the result.
+ * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry
+ *                         with its level.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_*_init()
+ *             or a similar initialiser.
+ * @addr:      Input address for the start of the walk.
+ * @ptep:      Pointer to storage for the retrieved PTE.
+ * @level:     Pointer to storage for the level of the retrieved PTE.
+ *
+ * The offset of @addr within a page is ignored.
  *
- * The offset of @addr within a page is ignored. An IPA is compatible with @prot
- * iff its corresponding stage-2 page-table entry has default ownership and, if
- * valid, is mapped with protection attributes identical to @prot.
+ * The walker will walk the page-table entries corresponding to the input
+ * address specified, retrieving the leaf corresponding to this address.
+ * Invalid entries are treated as leaf entries.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
-                                 enum kvm_pgtable_prot prot,
-                                 struct kvm_mem_range *range);
+int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
+                        kvm_pte_t *ptep, u32 *level);
+
+/**
+ * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a
+ *                                stage-2 Page-Table Entry.
+ * @pte:       Page-table entry
+ *
+ * Return: protection attributes of the page-table entry in the enum
+ *        kvm_pgtable_prot format.
+ */
+enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte);
+
+/**
+ * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1
+ *                             Page-Table Entry.
+ * @pte:       Page-table entry
+ *
+ * Return: protection attributes of the page-table entry in the enum
+ *        kvm_pgtable_prot format.
+ */
+enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte);
 #endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index f2e06e7..b268082 100644
 #define ID_AA64PFR0_AMU                        0x1
 #define ID_AA64PFR0_SVE                        0x1
 #define ID_AA64PFR0_RAS_V1             0x1
+#define ID_AA64PFR0_RAS_V1P1           0x2
 #define ID_AA64PFR0_FP_NI              0xf
 #define ID_AA64PFR0_FP_SUPPORTED       0x0
 #define ID_AA64PFR0_ASIMD_NI           0xf
 #define ID_AA64PFR0_ASIMD_SUPPORTED    0x0
-#define ID_AA64PFR0_EL1_64BIT_ONLY     0x1
-#define ID_AA64PFR0_EL1_32BIT_64BIT    0x2
-#define ID_AA64PFR0_EL0_64BIT_ONLY     0x1
-#define ID_AA64PFR0_EL0_32BIT_64BIT    0x2
+#define ID_AA64PFR0_ELx_64BIT_ONLY     0x1
+#define ID_AA64PFR0_ELx_32BIT_64BIT    0x2
 
 /* id_aa64pfr1 */
 #define ID_AA64PFR1_MPAMFRAC_SHIFT     16
 #define ID_AA64MMFR0_ASID_SHIFT                4
 #define ID_AA64MMFR0_PARANGE_SHIFT     0
 
+#define ID_AA64MMFR0_ASID_8            0x0
+#define ID_AA64MMFR0_ASID_16           0x2
+
 #define ID_AA64MMFR0_TGRAN4_NI                 0xf
 #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN      0x0
 #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX      0x7
 #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN     0x1
 #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX     0xf
 
+#define ID_AA64MMFR0_PARANGE_32                0x0
+#define ID_AA64MMFR0_PARANGE_36                0x1
+#define ID_AA64MMFR0_PARANGE_40                0x2
+#define ID_AA64MMFR0_PARANGE_42                0x3
+#define ID_AA64MMFR0_PARANGE_44                0x4
 #define ID_AA64MMFR0_PARANGE_48                0x5
 #define ID_AA64MMFR0_PARANGE_52                0x6
 
+#define ARM64_MIN_PARANGE_BITS         32
+
 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0
 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE    0x1
 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN     0x2
 #define ID_AA64MMFR2_CNP_SHIFT         0
 
 /* id_aa64dfr0 */
+#define ID_AA64DFR0_MTPMU_SHIFT                48
 #define ID_AA64DFR0_TRBE_SHIFT         44
 #define ID_AA64DFR0_TRACE_FILT_SHIFT   40
 #define ID_AA64DFR0_DOUBLELOCK_SHIFT   36
 #define ID_AA64MMFR0_TGRAN_SHIFT               ID_AA64MMFR0_TGRAN4_SHIFT
 #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN       ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN
 #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX       ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX
+#define ID_AA64MMFR0_TGRAN_2_SHIFT             ID_AA64MMFR0_TGRAN4_2_SHIFT
 #elif defined(CONFIG_ARM64_16K_PAGES)
 #define ID_AA64MMFR0_TGRAN_SHIFT               ID_AA64MMFR0_TGRAN16_SHIFT
 #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN       ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN
 #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX       ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX
+#define ID_AA64MMFR0_TGRAN_2_SHIFT             ID_AA64MMFR0_TGRAN16_2_SHIFT
 #elif defined(CONFIG_ARM64_64K_PAGES)
 #define ID_AA64MMFR0_TGRAN_SHIFT               ID_AA64MMFR0_TGRAN64_SHIFT
 #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN       ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN
 #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX       ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX
+#define ID_AA64MMFR0_TGRAN_2_SHIFT             ID_AA64MMFR0_TGRAN64_2_SHIFT
 #endif
 
 #define MVFR2_FPMISC_SHIFT             4
 #define ICH_VTR_A3V_SHIFT      21
 #define ICH_VTR_A3V_MASK       (1 << ICH_VTR_A3V_SHIFT)
 
+#define ARM64_FEATURE_FIELD_BITS       4
+
+/* Create a mask for the feature bits of the specified feature. */
+#define ARM64_FEATURE_MASK(x)  (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT))
+
 #ifdef __ASSEMBLY__
 
        .irp    num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index b2770d7..f8a3067 100644
@@ -240,8 +240,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
        S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0),
-       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
-       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
        ARM64_FTR_END,
 };
 
@@ -1983,7 +1983,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .sys_reg = SYS_ID_AA64PFR0_EL1,
                .sign = FTR_UNSIGNED,
                .field_pos = ID_AA64PFR0_EL0_SHIFT,
-               .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
+               .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
        },
 #ifdef CONFIG_KVM
        {
@@ -1994,7 +1994,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .sys_reg = SYS_ID_AA64PFR0_EL1,
                .sign = FTR_UNSIGNED,
                .field_pos = ID_AA64PFR0_EL1_SHIFT,
-               .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
+               .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
        },
        {
                .desc = "Protected KVM",
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 709d2c4..f6b1a88 100644
@@ -181,6 +181,8 @@ SECTIONS
        /* everything from this point to __init_begin will be marked RO NX */
        RO_DATA(PAGE_SIZE)
 
+       HYPERVISOR_DATA_SECTIONS
+
        idmap_pg_dir = .;
        . += IDMAP_DIR_SIZE;
        idmap_pg_end = .;
@@ -260,8 +262,6 @@ SECTIONS
        _sdata = .;
        RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
 
-       HYPERVISOR_DATA_SECTIONS
-
        /*
         * Data written with the MMU off but read with the MMU on requires
         * cache lines to be invalidated, discarding up to a Cache Writeback
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a4eba09..d7eec0b 100644
@@ -26,6 +26,7 @@ menuconfig KVM
        select HAVE_KVM_ARCH_TLB_FLUSH_ALL
        select KVM_MMIO
        select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+       select KVM_XFER_TO_GUEST_WORK
        select SRCU
        select KVM_VFIO
        select HAVE_KVM_EVENTFD
@@ -46,6 +47,15 @@ if KVM
 
 source "virt/kvm/Kconfig"
 
+config NVHE_EL2_DEBUG
+       bool "Debug mode for non-VHE EL2 object"
+       help
+         Say Y here to enable the debug mode for the non-VHE KVM EL2 object.
+         Failure reports will BUG() in the hypervisor. This is intended for
+         local EL2 hypervisor development.
+
+         If unsure, say N.
+
 endif # KVM
 
 endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 0ca72f5..fe102cd 100644
@@ -6,6 +6,7 @@
 
 #include <linux/bug.h>
 #include <linux/cpu_pm.h>
+#include <linux/entry-kvm.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
@@ -15,6 +16,7 @@
 #include <linux/fs.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/kmemleak.h>
 #include <linux/kvm.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <kvm/arm_pmu.h>
 #include <kvm/arm_psci.h>
 
-#ifdef REQUIRES_VIRT
-__asm__(".arch_extension       virt");
-#endif
-
 static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
 DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
 
@@ -575,7 +573,7 @@ static void update_vmid(struct kvm_vmid *vmid)
                kvm_call_hyp(__kvm_flush_vm_context);
        }
 
-       vmid->vmid = kvm_next_vmid;
+       WRITE_ONCE(vmid->vmid, kvm_next_vmid);
        kvm_next_vmid++;
        kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
 
@@ -718,6 +716,45 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
                static_branch_unlikely(&arm64_mismatched_32bit_el0);
 }
 
+/**
+ * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
+ * @vcpu:      The VCPU pointer
+ * @ret:       Pointer to write optional return code
+ *
+ * Returns: true if the VCPU needs to return to a preemptible + interruptible
+ *         and skip guest entry.
+ *
+ * This function disambiguates between two different types of exits: exits to a
+ * preemptible + interruptible kernel context and exits to userspace. For an
+ * exit to userspace, this function will write the return code to ret and return
+ * true. For an exit to preemptible + interruptible kernel context (i.e. check
+ * for pending work and re-enter), return true without writing to ret.
+ */
+static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
+{
+       struct kvm_run *run = vcpu->run;
+
+       /*
+        * If we're using a userspace irqchip, then check if we need
+        * to tell a userspace irqchip about timer or PMU level
+        * changes and if so, exit to userspace (the actual level
+        * state gets updated in kvm_timer_update_run and
+        * kvm_pmu_update_run below).
+        */
+       if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+               if (kvm_timer_should_notify_user(vcpu) ||
+                   kvm_pmu_should_notify_user(vcpu)) {
+                       *ret = -EINTR;
+                       run->exit_reason = KVM_EXIT_INTR;
+                       return true;
+               }
+       }
+
+       return kvm_request_pending(vcpu) ||
+                       need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) ||
+                       xfer_to_guest_mode_work_pending();
+}
+
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:      The VCPU pointer
@@ -761,7 +798,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                /*
                 * Check conditions before entering the guest
                 */
-               cond_resched();
+               ret = xfer_to_guest_mode_handle_work(vcpu);
+               if (!ret)
+                       ret = 1;
 
                update_vmid(&vcpu->arch.hw_mmu->vmid);
 
@@ -780,30 +819,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
                kvm_vgic_flush_hwstate(vcpu);
 
-               /*
-                * Exit if we have a signal pending so that we can deliver the
-                * signal to user space.
-                */
-               if (signal_pending(current)) {
-                       ret = -EINTR;
-                       run->exit_reason = KVM_EXIT_INTR;
-               }
-
-               /*
-                * If we're using a userspace irqchip, then check if we need
-                * to tell a userspace irqchip about timer or PMU level
-                * changes and if so, exit to userspace (the actual level
-                * state gets updated in kvm_timer_update_run and
-                * kvm_pmu_update_run below).
-                */
-               if (static_branch_unlikely(&userspace_irqchip_in_use)) {
-                       if (kvm_timer_should_notify_user(vcpu) ||
-                           kvm_pmu_should_notify_user(vcpu)) {
-                               ret = -EINTR;
-                               run->exit_reason = KVM_EXIT_INTR;
-                       }
-               }
-
                /*
                 * Ensure we set mode to IN_GUEST_MODE after we disable
                 * interrupts and before the final VCPU requests check.
@@ -812,8 +827,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                 */
                smp_store_mb(vcpu->mode, IN_GUEST_MODE);
 
-               if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) ||
-                   kvm_request_pending(vcpu)) {
+               if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
                        vcpu->mode = OUTSIDE_GUEST_MODE;
                        isb(); /* Ensure work in x_flush_hwstate is committed */
                        kvm_pmu_sync_hwstate(vcpu);
@@ -1039,7 +1053,7 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
                               const struct kvm_vcpu_init *init)
 {
        unsigned int i, ret;
-       int phys_target = kvm_target_cpu();
+       u32 phys_target = kvm_target_cpu();
 
        if (init->target != phys_target)
                return -EINVAL;
@@ -1108,6 +1122,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
        }
 
        vcpu_reset_hcr(vcpu);
+       vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
 
        /*
         * Handle the "start in power-off" case.
@@ -1219,6 +1234,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (copy_from_user(&reg, argp, sizeof(reg)))
                        break;
 
+               /*
+                * We could owe a reset due to PSCI. Handle the pending reset
+                * here to ensure userspace register accesses are ordered after
+                * the reset.
+                */
+               if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+                       kvm_reset_vcpu(vcpu);
+
                if (ioctl == KVM_SET_ONE_REG)
                        r = kvm_arm_set_reg(vcpu, &reg);
                else
@@ -1700,11 +1723,6 @@ static bool init_psci_relay(void)
        return true;
 }
 
-static int init_common_resources(void)
-{
-       return kvm_set_ipa_limit();
-}
-
 static int init_subsystems(void)
 {
        int err = 0;
@@ -1958,56 +1976,17 @@ static void _kvm_host_prot_finalize(void *discard)
        WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
 }
 
-static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
-{
-       return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
-}
-
-#define pkvm_mark_hyp_section(__section)               \
-       pkvm_mark_hyp(__pa_symbol(__section##_start),   \
-                       __pa_symbol(__section##_end))
-
 static int finalize_hyp_mode(void)
 {
-       int cpu, ret;
-
        if (!is_protected_kvm_enabled())
                return 0;
 
-       ret = pkvm_mark_hyp_section(__hyp_idmap_text);
-       if (ret)
-               return ret;
-
-       ret = pkvm_mark_hyp_section(__hyp_text);
-       if (ret)
-               return ret;
-
-       ret = pkvm_mark_hyp_section(__hyp_rodata);
-       if (ret)
-               return ret;
-
-       ret = pkvm_mark_hyp_section(__hyp_bss);
-       if (ret)
-               return ret;
-
-       ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
-       if (ret)
-               return ret;
-
-       for_each_possible_cpu(cpu) {
-               phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
-               phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
-
-               ret = pkvm_mark_hyp(start, end);
-               if (ret)
-                       return ret;
-
-               start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
-               end = start + PAGE_SIZE;
-               ret = pkvm_mark_hyp(start, end);
-               if (ret)
-                       return ret;
-       }
+       /*
+        * Exclude HYP BSS from kmemleak so that it doesn't get peeked
+        * at, which would end badly once the section is inaccessible.
+        * None of other sections should ever be introspected.
+        */
+       kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
 
        /*
         * Flip the static key upfront as that may no longer be possible
@@ -2019,11 +1998,6 @@ static int finalize_hyp_mode(void)
        return 0;
 }
 
-static void check_kvm_target_cpu(void *ret)
-{
-       *(int *)ret = kvm_target_cpu();
-}
-
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
 {
        struct kvm_vcpu *vcpu;
@@ -2083,7 +2057,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
 int kvm_arch_init(void *opaque)
 {
        int err;
-       int ret, cpu;
        bool in_hyp_mode;
 
        if (!is_hyp_mode_available()) {
@@ -2098,15 +2071,7 @@ int kvm_arch_init(void *opaque)
                kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
                         "Only trusted guests should be used on this system.\n");
 
-       for_each_online_cpu(cpu) {
-               smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
-               if (ret < 0) {
-                       kvm_err("Error, CPU %d not supported!\n", cpu);
-                       return -ENODEV;
-               }
-       }
-
-       err = init_common_resources();
+       err = kvm_set_ipa_limit();
        if (err)
                return err;
 
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index d5e79d7..db93613 100644
@@ -21,7 +21,7 @@
                                DBG_MDSCR_KDE | \
                                DBG_MDSCR_MDE)
 
-static DEFINE_PER_CPU(u32, mdcr_el2);
+static DEFINE_PER_CPU(u64, mdcr_el2);
 
 /**
  * save/restore_guest_debug_regs
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 1dfb835..5ce26be 100644
@@ -31,8 +31,6 @@
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        KVM_GENERIC_VM_STATS()
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -50,10 +48,9 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
        STATS_DESC_COUNTER(VCPU, mmio_exit_user),
        STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+       STATS_DESC_COUNTER(VCPU, signal_exits),
        STATS_DESC_COUNTER(VCPU, exits)
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -842,7 +839,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-int __attribute_const__ kvm_target_cpu(void)
+u32 __attribute_const__ kvm_target_cpu(void)
 {
        unsigned long implementor = read_cpuid_implementor();
        unsigned long part_number = read_cpuid_part_number();
@@ -874,7 +871,7 @@ int __attribute_const__ kvm_target_cpu(void)
 
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
 {
-       int target = kvm_target_cpu();
+       u32 target = kvm_target_cpu();
 
        if (target < 0)
                return -ENODEV;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 6f48336..275a273 100644
@@ -113,34 +113,20 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
  * guest and host are using the same debug facilities it will be up to
  * userspace to re-inject the correct exception for guest delivery.
  *
- * @return: 0 (while setting vcpu->run->exit_reason), -1 for error
+ * @return: 0 (while setting vcpu->run->exit_reason)
  */
 static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
        u32 esr = kvm_vcpu_get_esr(vcpu);
-       int ret = 0;
 
        run->exit_reason = KVM_EXIT_DEBUG;
        run->debug.arch.hsr = esr;
 
-       switch (ESR_ELx_EC(esr)) {
-       case ESR_ELx_EC_WATCHPT_LOW:
+       if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW)
                run->debug.arch.far = vcpu->arch.fault.far_el2;
-               fallthrough;
-       case ESR_ELx_EC_SOFTSTP_LOW:
-       case ESR_ELx_EC_BREAKPT_LOW:
-       case ESR_ELx_EC_BKPT32:
-       case ESR_ELx_EC_BRK64:
-               break;
-       default:
-               kvm_err("%s: un-handled case esr: %#08x\n",
-                       __func__, (unsigned int) esr);
-               ret = -1;
-               break;
-       }
 
-       return ret;
+       return 0;
 }
 
 static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
@@ -292,11 +278,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
                kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
 }
 
-void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
+void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
+                                             u64 elr_virt, u64 elr_phys,
                                              u64 par, uintptr_t vcpu,
                                              u64 far, u64 hpfar) {
-       u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
-       u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
+       u64 elr_in_kimg = __phys_to_kimg(elr_phys);
+       u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
        u64 mode = spsr & PSR_MODE_MASK;
 
        /*
@@ -309,20 +296,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
                kvm_err("Invalid host exception to nVHE hyp!\n");
        } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
                   (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
-               struct bug_entry *bug = find_bug(elr_in_kimg);
                const char *file = NULL;
                unsigned int line = 0;
 
                /* All hyp bugs, including warnings, are treated as fatal. */
-               if (bug)
-                       bug_get_file_line(bug, &file, &line);
+               if (!is_protected_kvm_enabled() ||
+                   IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
+                       struct bug_entry *bug = find_bug(elr_in_kimg);
+
+                       if (bug)
+                               bug_get_file_line(bug, &file, &line);
+               }
 
                if (file)
                        kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
                else
-                       kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
+                       kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
        } else {
-               kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
+               kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
        }
 
        /*
@@ -334,5 +325,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
        kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
 
        panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
-             spsr, elr, esr, far, hpfar, par, vcpu);
+             spsr, elr_virt, esr, far, hpfar, par, vcpu);
 }
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index e4a2f29..a0e78a6 100644
@@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
                write_sysreg(0, pmselr_el0);
                write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
        }
+
+       vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
        write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 }
 
-static inline void __deactivate_traps_common(void)
+static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
 {
+       write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2);
+
        write_sysreg(0, hstr_el2);
        if (kvm_arm_support_pmu_v3())
                write_sysreg(0, pmuserenr_el0);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 9c227d8..b58c910 100644
 #include <asm/virt.h>
 #include <nvhe/spinlock.h>
 
+/*
+ * SW bits 0-1 are reserved to track the memory ownership state of each page:
+ *   00: The page is owned exclusively by the page-table owner.
+ *   01: The page is owned by the page-table owner, but is shared
+ *       with another entity.
+ *   10: The page is shared with, but not owned by the page-table owner.
+ *   11: Reserved for future use (lending).
+ */
+enum pkvm_page_state {
+       PKVM_PAGE_OWNED                 = 0ULL,
+       PKVM_PAGE_SHARED_OWNED          = KVM_PGTABLE_PROT_SW0,
+       PKVM_PAGE_SHARED_BORROWED       = KVM_PGTABLE_PROT_SW1,
+};
+
+#define PKVM_PAGE_STATE_PROT_MASK      (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
+static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
+                                                enum pkvm_page_state state)
+{
+       return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
+}
+
+static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
+{
+       return prot & PKVM_PAGE_STATE_PROT_MASK;
+}
+
 struct host_kvm {
        struct kvm_arch arch;
        struct kvm_pgtable pgt;
@@ -20,16 +46,21 @@ struct host_kvm {
 };
 extern struct host_kvm host_kvm;
 
+extern const u8 pkvm_hyp_id;
+
 int __pkvm_prot_finalize(void);
-int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+int __pkvm_host_share_hyp(u64 pfn);
 
+bool addr_is_memory(phys_addr_t phys);
+int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
 int kvm_host_prepare_stage2(void *pgt_pool_base);
 void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
 
 static __always_inline void __load_host_stage2(void)
 {
        if (static_branch_likely(&kvm_protected_mode_initialized))
-               __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+               __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
        else
                write_sysreg(0, vttbr_el2);
 }
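
As a rough illustration of the page-state encoding introduced above, the two
software bits carry an ownership state that rides along in an otherwise normal
prot value. The standalone model below is only a sketch: the bit positions and
type names are made up for the example and stand in for KVM_PGTABLE_PROT_SW0/SW1
and enum kvm_pgtable_prot.

    /* Self-contained model of the pkvm page-state encode/decode pair. */
    #include <assert.h>

    #define SW0              (1u << 4)        /* illustrative positions only */
    #define SW1              (1u << 5)
    #define STATE_PROT_MASK  (SW0 | SW1)

    enum page_state {
            PAGE_OWNED           = 0,
            PAGE_SHARED_OWNED    = SW0,
            PAGE_SHARED_BORROWED = SW1,
    };

    static unsigned int mkstate(unsigned int prot, enum page_state state)
    {
            return (prot & ~STATE_PROT_MASK) | state;   /* cf. pkvm_mkstate()  */
    }

    static enum page_state getstate(unsigned int prot)
    {
            return prot & STATE_PROT_MASK;              /* cf. pkvm_getstate() */
    }

    int main(void)
    {
            unsigned int rw = 0x3;                      /* pretend R|W bits */
            unsigned int prot = mkstate(rw, PAGE_SHARED_OWNED);

            assert(getstate(prot) == PAGE_SHARED_OWNED);
            assert((prot & ~STATE_PROT_MASK) == rw);    /* other bits preserved */
            return 0;
    }
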
index 8ec3a5a..c9a8f53 100644 (file)
@@ -23,8 +23,7 @@ int hyp_map_vectors(void);
 int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
 int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
 int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
-int __pkvm_create_mappings(unsigned long start, unsigned long size,
-                          unsigned long phys, enum kvm_pgtable_prot prot);
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
 unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
                                            enum kvm_pgtable_prot prot);
 
index 76b537f..4652fd0 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <asm/alternative.h>
 #include <asm/lse.h>
+#include <asm/rwonce.h>
 
 typedef union hyp_spinlock {
        u32     __val;
@@ -89,4 +90,28 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
        : "memory");
 }
 
+static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock)
+{
+       hyp_spinlock_t lockval = READ_ONCE(*lock);
+
+       return lockval.owner != lockval.next;
+}
+
+#ifdef CONFIG_NVHE_EL2_DEBUG
+static inline void hyp_assert_lock_held(hyp_spinlock_t *lock)
+{
+       /*
+        * The __pkvm_init() path accesses protected data-structures without
+        * holding locks as the other CPUs are guaranteed to not enter EL2
+        * concurrently at this point in time. The point at which EL2 has been
+        * initialized on all CPUs is reflected in the pkvm static key, so
+        * wait until it is set before checking the lock state.
+        */
+       if (static_branch_likely(&kvm_protected_mode_initialized))
+               BUG_ON(!hyp_spin_is_locked(lock));
+}
+#else
+static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { }
+#endif
+
 #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
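
hyp_spin_is_locked() relies on the ticket-lock property that a free lock has
owner == next. Below is a minimal userspace model of just that check, assuming
a simplified two-field layout; the real hyp_spinlock_t packs owner/next into a
single u32 union.

    #include <assert.h>
    #include <stdint.h>

    /* Toy ticket lock; the field layout is an assumption for illustration. */
    struct toy_ticket_lock {
            uint16_t owner;    /* ticket currently being served */
            uint16_t next;     /* next ticket to hand out       */
    };

    static int toy_is_locked(const struct toy_ticket_lock *lock)
    {
            /* Same test as hyp_spin_is_locked(): held iff owner != next. */
            return lock->owner != lock->next;
    }

    int main(void)
    {
            struct toy_ticket_lock lock = { .owner = 0, .next = 0 };

            assert(!toy_is_locked(&lock));   /* fresh lock is free */
            lock.next++;                     /* "acquire": take a ticket */
            assert(toy_is_locked(&lock));
            lock.owner++;                    /* "release": serve the ticket */
            assert(!toy_is_locked(&lock));
            return 0;
    }
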
index 7d3f258..df361d8 100644 (file)
@@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
        __debug_switch_to_host_common(vcpu);
 }
 
-u32 __kvm_get_mdcr_el2(void)
+u64 __kvm_get_mdcr_el2(void)
 {
        return read_sysreg(mdcr_el2);
 }
index 2b23400..4b652ff 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/linkage.h>
 
 #include <asm/assembler.h>
+#include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
 
@@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic)
 
        mov     x29, x0
 
+#ifdef CONFIG_NVHE_EL2_DEBUG
+       /* Ensure host stage-2 is disabled */
+       mrs     x0, hcr_el2
+       bic     x0, x0, #HCR_VM
+       msr     hcr_el2, x0
+       isb
+       tlbi    vmalls12e1
+       dsb     nsh
+#endif
+
        /* Load the panic arguments into x0-7 */
        mrs     x0, esr_el2
-       get_vcpu_ptr x4, x5
-       mrs     x5, far_el2
-       mrs     x6, hpfar_el2
-       mov     x7, xzr                 // Unused argument
+       mov     x4, x3
+       mov     x3, x2
+       hyp_pa  x3, x6
+       get_vcpu_ptr x5, x6
+       mrs     x6, far_el2
+       mrs     x7, hpfar_el2
 
        /* Enter the host, conditionally restoring the host context. */
        cbz     x29, __host_enter_without_restoring
index 1632f00..2da6aa8 100644 (file)
@@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
        cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
 }
 
-static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt)
 {
-       DECLARE_REG(unsigned long, start, host_ctxt, 1);
-       DECLARE_REG(unsigned long, size, host_ctxt, 2);
-       DECLARE_REG(unsigned long, phys, host_ctxt, 3);
-       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+       DECLARE_REG(u64, pfn, host_ctxt, 1);
 
-       cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+       cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn);
 }
 
 static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
@@ -163,14 +160,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
 {
        cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
 }
-
-static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
-{
-       DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
-       DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
-
-       cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
-}
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -193,10 +182,9 @@ static const hcall_t host_hcall[] = {
        HANDLE_FUNC(__vgic_v3_restore_aprs),
        HANDLE_FUNC(__pkvm_init),
        HANDLE_FUNC(__pkvm_cpu_set_vector),
-       HANDLE_FUNC(__pkvm_create_mappings),
+       HANDLE_FUNC(__pkvm_host_share_hyp),
        HANDLE_FUNC(__pkvm_create_private_mapping),
        HANDLE_FUNC(__pkvm_prot_finalize),
-       HANDLE_FUNC(__pkvm_mark_hyp),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
index a6ce991..bacd493 100644 (file)
@@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool;
 u64 id_aa64mmfr0_el1_sys_val;
 u64 id_aa64mmfr1_el1_sys_val;
 
-static const u8 pkvm_hyp_id = 1;
+const u8 pkvm_hyp_id = 1;
 
 static void *host_s2_zalloc_pages_exact(size_t size)
 {
@@ -89,6 +89,8 @@ static void prepare_host_vtcr(void)
                                          id_aa64mmfr1_el1_sys_val, phys_shift);
 }
 
+static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
+
 int kvm_host_prepare_stage2(void *pgt_pool_base)
 {
        struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
@@ -101,16 +103,17 @@ int kvm_host_prepare_stage2(void *pgt_pool_base)
        if (ret)
                return ret;
 
-       ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
-                                           &host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
+       ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
+                                       &host_kvm.mm_ops, KVM_HOST_S2_FLAGS,
+                                       host_stage2_force_pte_cb);
        if (ret)
                return ret;
 
        mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
        mmu->arch = &host_kvm.arch;
        mmu->pgt = &host_kvm.pgt;
-       mmu->vmid.vmid_gen = 0;
-       mmu->vmid.vmid = 0;
+       WRITE_ONCE(mmu->vmid.vmid_gen, 0);
+       WRITE_ONCE(mmu->vmid.vmid, 0);
 
        return 0;
 }
@@ -126,7 +129,7 @@ int __pkvm_prot_finalize(void)
        kvm_flush_dcache_to_poc(params, sizeof(*params));
 
        write_sysreg(params->hcr_el2, hcr_el2);
-       __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+       __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
 
        /*
         * Make sure to have an ISB before the TLB maintenance below but only
@@ -159,6 +162,11 @@ static int host_stage2_unmap_dev_all(void)
        return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
 }
 
+struct kvm_mem_range {
+       u64 start;
+       u64 end;
+};
+
 static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
 {
        int cur, left = 0, right = hyp_memblock_nr;
@@ -189,16 +197,26 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
        return false;
 }
 
+bool addr_is_memory(phys_addr_t phys)
+{
+       struct kvm_mem_range range;
+
+       return find_mem_range(phys, &range);
+}
+
+static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
+{
+       return range->start <= addr && addr < range->end;
+}
+
 static bool range_is_memory(u64 start, u64 end)
 {
-       struct kvm_mem_range r1, r2;
+       struct kvm_mem_range r;
 
-       if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2))
-               return false;
-       if (r1.start != r2.start)
+       if (!find_mem_range(start, &r))
                return false;
 
-       return true;
+       return is_in_mem_range(end - 1, &r);
 }
 
 static inline int __host_stage2_idmap(u64 start, u64 end,
@@ -208,60 +226,208 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
                                      prot, &host_s2_pool);
 }
 
+/*
+ * The pool has been provided with enough pages to cover all of memory with
+ * page granularity, but it is difficult to know how much of the MMIO range
+ * we will need to cover upfront, so we may need to 'recycle' the pages if we
+ * run out.
+ */
+#define host_stage2_try(fn, ...)                                       \
+       ({                                                              \
+               int __ret;                                              \
+               hyp_assert_lock_held(&host_kvm.lock);                   \
+               __ret = fn(__VA_ARGS__);                                \
+               if (__ret == -ENOMEM) {                                 \
+                       __ret = host_stage2_unmap_dev_all();            \
+                       if (!__ret)                                     \
+                               __ret = fn(__VA_ARGS__);                \
+               }                                                       \
+               __ret;                                                  \
+        })
+
+static inline bool range_included(struct kvm_mem_range *child,
+                                 struct kvm_mem_range *parent)
+{
+       return parent->start <= child->start && child->end <= parent->end;
+}
+
+static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
+{
+       struct kvm_mem_range cur;
+       kvm_pte_t pte;
+       u32 level;
+       int ret;
+
+       hyp_assert_lock_held(&host_kvm.lock);
+       ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
+       if (ret)
+               return ret;
+
+       if (kvm_pte_valid(pte))
+               return -EAGAIN;
+
+       if (pte)
+               return -EPERM;
+
+       do {
+               u64 granule = kvm_granule_size(level);
+               cur.start = ALIGN_DOWN(addr, granule);
+               cur.end = cur.start + granule;
+               level++;
+       } while ((level < KVM_PGTABLE_MAX_LEVELS) &&
+                       !(kvm_level_supports_block_mapping(level) &&
+                         range_included(&cur, range)));
+
+       *range = cur;
+
+       return 0;
+}
+
+int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
+                            enum kvm_pgtable_prot prot)
+{
+       hyp_assert_lock_held(&host_kvm.lock);
+
+       return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
+}
+
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+{
+       hyp_assert_lock_held(&host_kvm.lock);
+
+       return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
+                              addr, size, &host_s2_pool, owner_id);
+}
+
+static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
+{
+       /*
+        * Block mappings must be used with care in the host stage-2 as a
+        * kvm_pgtable_stage2_map() operation targeting a page in the range of
+        * an existing block will delete the block under the assumption that
+        * mappings in the rest of the block range can always be rebuilt lazily.
+        * That assumption is correct for the host stage-2 with RWX mappings
+        * targeting memory or RW mappings targeting MMIO ranges (see
+        * host_stage2_idmap() below which implements some of the host memory
+        * abort logic). However, this is not safe for any other mappings where
+        * the host stage-2 page-table is in fact the only place where this
+        * state is stored. In all those cases, it is safer to use page-level
+        * mappings, which avoid losing that state to side effects in
+        * kvm_pgtable_stage2_map().
+        */
+       if (range_is_memory(addr, end))
+               return prot != PKVM_HOST_MEM_PROT;
+       else
+               return prot != PKVM_HOST_MMIO_PROT;
+}
+
 static int host_stage2_idmap(u64 addr)
 {
-       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
        struct kvm_mem_range range;
        bool is_memory = find_mem_range(addr, &range);
+       enum kvm_pgtable_prot prot;
        int ret;
 
-       if (is_memory)
-               prot |= KVM_PGTABLE_PROT_X;
+       prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT;
 
        hyp_spin_lock(&host_kvm.lock);
-       ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+       ret = host_stage2_adjust_range(addr, &range);
        if (ret)
                goto unlock;
 
-       ret = __host_stage2_idmap(range.start, range.end, prot);
-       if (ret != -ENOMEM)
+       ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot);
+unlock:
+       hyp_spin_unlock(&host_kvm.lock);
+
+       return ret;
+}
+
+static inline bool check_prot(enum kvm_pgtable_prot prot,
+                             enum kvm_pgtable_prot required,
+                             enum kvm_pgtable_prot denied)
+{
+       return (prot & (required | denied)) == required;
+}
+
+int __pkvm_host_share_hyp(u64 pfn)
+{
+       phys_addr_t addr = hyp_pfn_to_phys(pfn);
+       enum kvm_pgtable_prot prot, cur;
+       void *virt = __hyp_va(addr);
+       enum pkvm_page_state state;
+       kvm_pte_t pte;
+       int ret;
+
+       if (!addr_is_memory(addr))
+               return -EINVAL;
+
+       hyp_spin_lock(&host_kvm.lock);
+       hyp_spin_lock(&pkvm_pgd_lock);
+
+       ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL);
+       if (ret)
                goto unlock;
+       if (!pte)
+               goto map_shared;
 
        /*
-        * The pool has been provided with enough pages to cover all of memory
-        * with page granularity, but it is difficult to know how much of the
-        * MMIO range we will need to cover upfront, so we may need to 'recycle'
-        * the pages if we run out.
+        * Check attributes in the host stage-2 PTE. We need the page to be:
+        *  - mapped RWX as we're sharing memory;
+        *  - not borrowed, as that implies absence of ownership.
+        * Otherwise, we can't let it go through.
         */
-       ret = host_stage2_unmap_dev_all();
-       if (ret)
+       cur = kvm_pgtable_stage2_pte_prot(pte);
+       prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED);
+       if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) {
+               ret = -EPERM;
                goto unlock;
+       }
 
-       ret = __host_stage2_idmap(range.start, range.end, prot);
+       state = pkvm_getstate(cur);
+       if (state == PKVM_PAGE_OWNED)
+               goto map_shared;
 
-unlock:
-       hyp_spin_unlock(&host_kvm.lock);
+       /*
+        * Tolerate double-sharing the same page, but this requires
+        * cross-checking the hypervisor stage-1.
+        */
+       if (state != PKVM_PAGE_SHARED_OWNED) {
+               ret = -EPERM;
+               goto unlock;
+       }
 
-       return ret;
-}
+       ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL);
+       if (ret)
+               goto unlock;
 
-int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
-{
-       int ret;
+       /*
+        * If the page has been shared with the hypervisor, it must already
+        * be mapped as SHARED_BORROWED in its stage-1.
+        */
+       cur = kvm_pgtable_hyp_pte_prot(pte);
+       prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
+       if (!check_prot(cur, prot, ~prot))
+               ret = -EPERM;
+       goto unlock;
 
+map_shared:
        /*
-        * host_stage2_unmap_dev_all() currently relies on MMIO mappings being
-        * non-persistent, so don't allow changing page ownership in MMIO range.
+        * If the page is not yet shared, adjust mappings in both page-tables
+        * while both locks are held.
         */
-       if (!range_is_memory(start, end))
-               return -EINVAL;
+       prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
+       ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot);
+       BUG_ON(ret);
 
-       hyp_spin_lock(&host_kvm.lock);
-       ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
-                                          &host_s2_pool, pkvm_hyp_id);
+       prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+       ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot);
+       BUG_ON(ret);
+
+unlock:
+       hyp_spin_unlock(&pkvm_pgd_lock);
        hyp_spin_unlock(&host_kvm.lock);
 
-       return ret != -EAGAIN ? ret : 0;
+       return ret;
 }
 
 void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
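
The host_stage2_try() macro above encodes a retry-on-ENOMEM policy: attempt the
page-table update, and if the pool is exhausted, reclaim the rebuildable MMIO
mappings and retry exactly once. The sketch below only mirrors that control
flow; the allocator and the reclaim step are placeholders, not the kernel
helpers.

    #include <assert.h>
    #include <errno.h>

    static int pool_pages;                       /* pretend page-pool state */

    static int map_one_page(void)
    {
            if (pool_pages == 0)
                    return -ENOMEM;
            pool_pages--;
            return 0;
    }

    static int reclaim_mmio_mappings(void)       /* stand-in for host_stage2_unmap_dev_all() */
    {
            pool_pages += 4;
            return 0;
    }

    /* Same shape as host_stage2_try(): on -ENOMEM, reclaim and retry once. */
    static int try_map_one_page(void)
    {
            int ret = map_one_page();

            if (ret == -ENOMEM) {
                    ret = reclaim_mmio_mappings();
                    if (!ret)
                            ret = map_one_page();
            }
            return ret;
    }

    int main(void)
    {
            assert(try_map_one_page() == 0);     /* first try fails, retry succeeds */
            return 0;
    }
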
index a8efdf0..2fabece 100644 (file)
@@ -23,8 +23,8 @@ u64 __io_map_base;
 struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
 unsigned int hyp_memblock_nr;
 
-int __pkvm_create_mappings(unsigned long start, unsigned long size,
-                         unsigned long phys, enum kvm_pgtable_prot prot)
+static int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                                 unsigned long phys, enum kvm_pgtable_prot prot)
 {
        int err;
 
@@ -67,13 +67,15 @@ out:
        return addr;
 }
 
-int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot)
 {
        unsigned long start = (unsigned long)from;
        unsigned long end = (unsigned long)to;
        unsigned long virt_addr;
        phys_addr_t phys;
 
+       hyp_assert_lock_held(&pkvm_pgd_lock);
+
        start = start & PAGE_MASK;
        end = PAGE_ALIGN(end);
 
@@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
                int err;
 
                phys = hyp_virt_to_phys((void *)virt_addr);
-               err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
+               err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE,
+                                         phys, prot);
                if (err)
                        return err;
        }
@@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
        return 0;
 }
 
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+       int ret;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+       ret = pkvm_create_mappings_locked(from, to, prot);
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return ret;
+}
+
 int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
 {
        unsigned long start, end;
index 0b574d1..57c2784 100644 (file)
@@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
 {
        void *start, *end, *virt = hyp_phys_to_virt(phys);
        unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+       enum kvm_pgtable_prot prot;
        int ret, i;
 
        /* Recreate the hyp page-table using the early page allocator */
@@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
        if (ret)
                return ret;
 
-       ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
-       if (ret)
-               return ret;
-
        ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
        if (ret)
                return ret;
@@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
        if (ret)
                return ret;
 
-       ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
-       if (ret)
-               return ret;
-
        ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
        if (ret)
                return ret;
@@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
                        return ret;
        }
 
+       /*
+        * Map the host's .bss and .rodata sections RO in the hypervisor, but
+        * transfer the ownership from the host to the hypervisor itself to
+        * make sure it can't be donated or shared with another entity.
+        *
+        * The ownership transition requires matching changes in the host
+        * stage-2. This will be done later (see finalize_host_mappings()) once
+        * the hyp_vmemmap is addressable.
+        */
+       prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
+       ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot);
+       if (ret)
+               return ret;
+
        return 0;
 }
 
@@ -148,6 +159,57 @@ static void hpool_put_page(void *addr)
        hyp_put_page(&hpool, addr);
 }
 
+static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
+                                        kvm_pte_t *ptep,
+                                        enum kvm_pgtable_walk_flags flag,
+                                        void * const arg)
+{
+       enum kvm_pgtable_prot prot;
+       enum pkvm_page_state state;
+       kvm_pte_t pte = *ptep;
+       phys_addr_t phys;
+
+       if (!kvm_pte_valid(pte))
+               return 0;
+
+       if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
+               return -EINVAL;
+
+       phys = kvm_pte_to_phys(pte);
+       if (!addr_is_memory(phys))
+               return 0;
+
+       /*
+        * Adjust the host stage-2 mappings to match the ownership attributes
+        * configured in the hypervisor stage-1.
+        */
+       state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
+       switch (state) {
+       case PKVM_PAGE_OWNED:
+               return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id);
+       case PKVM_PAGE_SHARED_OWNED:
+               prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
+               break;
+       case PKVM_PAGE_SHARED_BORROWED:
+               prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
+}
+
+static int finalize_host_mappings(void)
+{
+       struct kvm_pgtable_walker walker = {
+               .cb     = finalize_host_mappings_walker,
+               .flags  = KVM_PGTABLE_WALK_LEAF,
+       };
+
+       return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker);
+}
+
 void __noreturn __pkvm_init_finalise(void)
 {
        struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void)
        if (ret)
                goto out;
 
+       ret = finalize_host_mappings();
+       if (ret)
+               goto out;
+
        pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
                .zalloc_page = hyp_zalloc_hyp_page,
                .phys_to_virt = hyp_phys_to_virt,
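
finalize_host_mappings_walker() mirrors each hypervisor stage-1 page state into
the host stage-2: pages the hypervisor owns outright have their host-side
ownership transferred, and shared pages get the complementary
SHARED_OWNED/SHARED_BORROWED annotation. The sketch below merely restates that
correspondence; the action strings are descriptive placeholders, not kernel
functions.

    #include <stdio.h>

    enum page_state { OWNED, SHARED_OWNED, SHARED_BORROWED };

    static const char *host_action(enum page_state hyp_s1_state)
    {
            switch (hyp_s1_state) {
            case OWNED:           return "transfer host-side ownership to the hyp";
            case SHARED_OWNED:    return "remap in host stage-2 as SHARED_BORROWED";
            case SHARED_BORROWED: return "remap in host stage-2 as SHARED_OWNED";
            default:              return "invalid state";
            }
    }

    int main(void)
    {
            printf("hyp OWNED           -> %s\n", host_action(OWNED));
            printf("hyp SHARED_OWNED    -> %s\n", host_action(SHARED_OWNED));
            printf("hyp SHARED_BORROWED -> %s\n", host_action(SHARED_BORROWED));
            return 0;
    }
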
index f7af968..a34b01c 100644 (file)
@@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
        ___activate_traps(vcpu);
        __activate_traps_common(vcpu);
 
-       val = CPTR_EL2_DEFAULT;
+       val = vcpu->arch.cptr_el2;
        val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
        if (!update_fp_enabled(vcpu)) {
                val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
@@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 static void __deactivate_traps(struct kvm_vcpu *vcpu)
 {
        extern char __kvm_hyp_host_vector[];
-       u64 mdcr_el2, cptr;
+       u64 cptr;
 
        ___deactivate_traps(vcpu);
 
-       mdcr_el2 = read_sysreg(mdcr_el2);
-
        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                u64 val;
 
@@ -92,13 +90,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
                isb();
        }
 
-       __deactivate_traps_common();
-
-       mdcr_el2 &= MDCR_EL2_HPMN_MASK;
-       mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
-       mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
+       __deactivate_traps_common(vcpu);
 
-       write_sysreg(mdcr_el2, mdcr_el2);
        write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
 
        cptr = CPTR_EL2_DEFAULT;
@@ -170,6 +163,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
+       struct kvm_s2_mmu *mmu;
        bool pmu_switch_needed;
        u64 exit_code;
 
@@ -213,7 +207,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
        __sysreg32_restore_state(vcpu);
        __sysreg_restore_state_nvhe(guest_ctxt);
 
-       __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
+       mmu = kern_hyp_va(vcpu->arch.hw_mmu);
+       __load_stage2(mmu, kern_hyp_va(mmu->arch));
        __activate_traps(vcpu);
 
        __hyp_vgic_restore_state(vcpu);
index 38ed0f6..d296d61 100644 (file)
@@ -34,12 +34,12 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
        }
 
        /*
-        * __load_guest_stage2() includes an ISB only when the AT
+        * __load_stage2() includes an ISB only when the AT
         * workaround is applied. Take care of the opposite condition,
         * ensuring that we always have an ISB, but not two ISBs back
         * to back.
         */
-       __load_guest_stage2(mmu);
+       __load_stage2(mmu, kern_hyp_va(mmu->arch));
        asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
 }
 
index 05321f4..f8ceebe 100644 (file)
 #include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
 
-#define KVM_PTE_VALID                  BIT(0)
 
 #define KVM_PTE_TYPE                   BIT(1)
 #define KVM_PTE_TYPE_BLOCK             0
 #define KVM_PTE_TYPE_PAGE              1
 #define KVM_PTE_TYPE_TABLE             1
 
-#define KVM_PTE_ADDR_MASK              GENMASK(47, PAGE_SHIFT)
-#define KVM_PTE_ADDR_51_48             GENMASK(15, 12)
-
 #define KVM_PTE_LEAF_ATTR_LO           GENMASK(11, 2)
 
 #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX        GENMASK(4, 2)
@@ -40,6 +36,8 @@
 
 #define KVM_PTE_LEAF_ATTR_HI           GENMASK(63, 51)
 
+#define KVM_PTE_LEAF_ATTR_HI_SW                GENMASK(58, 55)
+
 #define KVM_PTE_LEAF_ATTR_HI_S1_XN     BIT(54)
 
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN     BIT(54)
@@ -48,9 +46,7 @@
                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
                                         KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
-#define KVM_PTE_LEAF_ATTR_S2_IGNORED   GENMASK(58, 55)
-
-#define KVM_INVALID_PTE_OWNER_MASK     GENMASK(63, 56)
+#define KVM_INVALID_PTE_OWNER_MASK     GENMASK(9, 2)
 #define KVM_MAX_OWNER_ID               1
 
 struct kvm_pgtable_walk_data {
@@ -61,17 +57,6 @@ struct kvm_pgtable_walk_data {
        u64                             end;
 };
 
-static u64 kvm_granule_shift(u32 level)
-{
-       /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
-       return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
-}
-
-static u64 kvm_granule_size(u32 level)
-{
-       return BIT(kvm_granule_shift(level));
-}
-
 #define KVM_PHYS_INVALID (-1ULL)
 
 static bool kvm_phys_is_valid(u64 phys)
@@ -79,15 +64,6 @@ static bool kvm_phys_is_valid(u64 phys)
        return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
 }
 
-static bool kvm_level_supports_block_mapping(u32 level)
-{
-       /*
-        * Reject invalid block mappings and don't bother with 4TB mappings for
-        * 52-bit PAs.
-        */
-       return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
-}
-
 static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
 {
        u64 granule = kvm_granule_size(level);
@@ -135,11 +111,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
        return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
 }
 
-static bool kvm_pte_valid(kvm_pte_t pte)
-{
-       return pte & KVM_PTE_VALID;
-}
-
 static bool kvm_pte_table(kvm_pte_t pte, u32 level)
 {
        if (level == KVM_PGTABLE_MAX_LEVELS - 1)
@@ -151,16 +122,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level)
        return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
 }
 
-static u64 kvm_pte_to_phys(kvm_pte_t pte)
-{
-       u64 pa = pte & KVM_PTE_ADDR_MASK;
-
-       if (PAGE_SHIFT == 16)
-               pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
-
-       return pa;
-}
-
 static kvm_pte_t kvm_phys_to_pte(u64 pa)
 {
        kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
@@ -326,6 +287,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
        return _kvm_pgtable_walk(&walk_data);
 }
 
+struct leaf_walk_data {
+       kvm_pte_t       pte;
+       u32             level;
+};
+
+static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                      enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+       struct leaf_walk_data *data = arg;
+
+       data->pte   = *ptep;
+       data->level = level;
+
+       return 0;
+}
+
+int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
+                        kvm_pte_t *ptep, u32 *level)
+{
+       struct leaf_walk_data data;
+       struct kvm_pgtable_walker walker = {
+               .cb     = leaf_walker,
+               .flags  = KVM_PGTABLE_WALK_LEAF,
+               .arg    = &data,
+       };
+       int ret;
+
+       ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
+                              PAGE_SIZE, &walker);
+       if (!ret) {
+               if (ptep)
+                       *ptep  = data.pte;
+               if (level)
+                       *level = data.level;
+       }
+
+       return ret;
+}
+
 struct hyp_map_data {
        u64                             phys;
        kvm_pte_t                       attr;
@@ -357,11 +357,47 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+       attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
        *ptep = attr;
 
        return 0;
 }
 
+enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
+{
+       enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
+       u32 ap;
+
+       if (!kvm_pte_valid(pte))
+               return prot;
+
+       if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
+               prot |= KVM_PGTABLE_PROT_X;
+
+       ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
+       if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
+               prot |= KVM_PGTABLE_PROT_R;
+       else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
+               prot |= KVM_PGTABLE_PROT_RW;
+
+       return prot;
+}
+
+static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+       /*
+        * Tolerate KVM recreating the exact same mapping, or changing software
+        * bits if the existing mapping was valid.
+        */
+       if (old == new)
+               return false;
+
+       if (!kvm_pte_valid(old))
+               return true;
+
+       return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW);
+}
+
 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
                                    kvm_pte_t *ptep, struct hyp_map_data *data)
 {
@@ -371,9 +407,8 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
        if (!kvm_block_mapping_supported(addr, end, phys, level))
                return false;
 
-       /* Tolerate KVM recreating the exact same mapping */
        new = kvm_init_valid_leaf_pte(phys, data->attr, level);
-       if (old != new && !WARN_ON(kvm_pte_valid(old)))
+       if (hyp_pte_needs_update(old, new))
                smp_store_release(ptep, new);
 
        data->phys += granule;
@@ -438,6 +473,8 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
        pgt->start_level        = KVM_PGTABLE_MAX_LEVELS - levels;
        pgt->mm_ops             = mm_ops;
        pgt->mmu                = NULL;
+       pgt->force_pte_cb       = NULL;
+
        return 0;
 }
 
@@ -475,6 +512,9 @@ struct stage2_map_data {
        void                            *memcache;
 
        struct kvm_pgtable_mm_ops       *mm_ops;
+
+       /* Force mappings to page granularity */
+       bool                            force_pte;
 };
 
 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
@@ -539,11 +579,29 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
 
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+       attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
        *ptep = attr;
 
        return 0;
 }
 
+enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
+{
+       enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
+
+       if (!kvm_pte_valid(pte))
+               return prot;
+
+       if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
+               prot |= KVM_PGTABLE_PROT_R;
+       if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
+               prot |= KVM_PGTABLE_PROT_W;
+       if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
+               prot |= KVM_PGTABLE_PROT_X;
+
+       return prot;
+}
+
 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
 {
        if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
@@ -588,6 +646,15 @@ static bool stage2_pte_executable(kvm_pte_t pte)
        return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
 }
 
+static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level,
+                                       struct stage2_map_data *data)
+{
+       if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1)))
+               return false;
+
+       return kvm_block_mapping_supported(addr, end, data->phys, level);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
@@ -597,7 +664,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
        struct kvm_pgtable *pgt = data->mmu->pgt;
        struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
-       if (!kvm_block_mapping_supported(addr, end, phys, level))
+       if (!stage2_leaf_mapping_allowed(addr, end, level, data))
                return -E2BIG;
 
        if (kvm_phys_is_valid(phys))
@@ -641,7 +708,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
        if (data->anchor)
                return 0;
 
-       if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+       if (!stage2_leaf_mapping_allowed(addr, end, level, data))
                return 0;
 
        data->childp = kvm_pte_follow(*ptep, data->mm_ops);
@@ -771,6 +838,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                .mmu            = pgt->mmu,
                .memcache       = mc,
                .mm_ops         = pgt->mm_ops,
+               .force_pte      = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
        };
        struct kvm_pgtable_walker walker = {
                .cb             = stage2_map_walker,
@@ -802,6 +870,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
                .memcache       = mc,
                .mm_ops         = pgt->mm_ops,
                .owner_id       = owner_id,
+               .force_pte      = true,
        };
        struct kvm_pgtable_walker walker = {
                .cb             = stage2_map_walker,
@@ -995,6 +1064,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
        u32 level;
        kvm_pte_t set = 0, clr = 0;
 
+       if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
+               return -EINVAL;
+
        if (prot & KVM_PGTABLE_PROT_R)
                set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
 
@@ -1043,9 +1115,11 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
        return kvm_pgtable_walk(pgt, addr, size, &walker);
 }
 
-int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
-                                 struct kvm_pgtable_mm_ops *mm_ops,
-                                 enum kvm_pgtable_stage2_flags flags)
+
+int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                             struct kvm_pgtable_mm_ops *mm_ops,
+                             enum kvm_pgtable_stage2_flags flags,
+                             kvm_pgtable_force_pte_cb_t force_pte_cb)
 {
        size_t pgd_sz;
        u64 vtcr = arch->vtcr;
@@ -1063,6 +1137,7 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch
        pgt->mm_ops             = mm_ops;
        pgt->mmu                = &arch->mmu;
        pgt->flags              = flags;
+       pgt->force_pte_cb       = force_pte_cb;
 
        /* Ensure zeroed PGD pages are visible to the hardware walker */
        dsb(ishst);
@@ -1102,77 +1177,3 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
        pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
        pgt->pgd = NULL;
 }
-
-#define KVM_PTE_LEAF_S2_COMPAT_MASK    (KVM_PTE_LEAF_ATTR_S2_PERMS | \
-                                        KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
-                                        KVM_PTE_LEAF_ATTR_S2_IGNORED)
-
-static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
-                                         kvm_pte_t *ptep,
-                                         enum kvm_pgtable_walk_flags flag,
-                                         void * const arg)
-{
-       kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
-
-       /*
-        * Compatible mappings are either invalid and owned by the page-table
-        * owner (whose id is 0), or valid with matching permission attributes.
-        */
-       if (kvm_pte_valid(pte)) {
-               old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
-               if (old_attr != *new_attr)
-                       return -EEXIST;
-       } else if (pte) {
-               return -EEXIST;
-       }
-
-       return 0;
-}
-
-int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
-                                 enum kvm_pgtable_prot prot,
-                                 struct kvm_mem_range *range)
-{
-       kvm_pte_t attr;
-       struct kvm_pgtable_walker check_perm_walker = {
-               .cb             = stage2_check_permission_walker,
-               .flags          = KVM_PGTABLE_WALK_LEAF,
-               .arg            = &attr,
-       };
-       u64 granule, start, end;
-       u32 level;
-       int ret;
-
-       ret = stage2_set_prot_attr(pgt, prot, &attr);
-       if (ret)
-               return ret;
-       attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
-
-       for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
-               granule = kvm_granule_size(level);
-               start = ALIGN_DOWN(addr, granule);
-               end = start + granule;
-
-               if (!kvm_level_supports_block_mapping(level))
-                       continue;
-
-               if (start < range->start || range->end < end)
-                       continue;
-
-               /*
-                * Check the presence of existing mappings with incompatible
-                * permissions within the current block range, and try one level
-                * deeper if one is found.
-                */
-               ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
-               if (ret != -EEXIST)
-                       break;
-       }
-
-       if (!ret) {
-               range->start = start;
-               range->end = end;
-       }
-
-       return ret;
-}
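
The new kvm_pgtable_stage2_pte_prot()/kvm_pgtable_hyp_pte_prot() helpers recover
a software R/W/X view from the hardware descriptor bits, with execute permission
being the inverse of the XN bit. The standalone decode below uses made-up bit
positions purely for illustration; the real layouts are the VMSA stage-1/stage-2
attribute fields.

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative bit positions only. */
    #define S2AP_R  (1u << 6)
    #define S2AP_W  (1u << 7)
    #define S2_XN   (1u << 8)

    #define PROT_R  (1u << 0)
    #define PROT_W  (1u << 1)
    #define PROT_X  (1u << 2)

    static unsigned int pte_to_prot(uint64_t pte)
    {
            unsigned int prot = 0;

            if (pte & S2AP_R)
                    prot |= PROT_R;
            if (pte & S2AP_W)
                    prot |= PROT_W;
            if (!(pte & S2_XN))              /* XN clear => executable */
                    prot |= PROT_X;
            return prot;
    }

    int main(void)
    {
            assert(pte_to_prot(S2AP_R | S2AP_W) == (PROT_R | PROT_W | PROT_X));
            assert(pte_to_prot(S2AP_R | S2_XN) == PROT_R);
            return 0;
    }
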
index f1e2e5a..289689b 100644 (file)
@@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
        __debug_switch_to_host_common(vcpu);
 }
 
-u32 __kvm_get_mdcr_el2(void)
+u64 __kvm_get_mdcr_el2(void)
 {
        return read_sysreg(mdcr_el2);
 }
index b322992..ded2c66 100644 (file)
@@ -91,17 +91,9 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu)
        __activate_traps_common(vcpu);
 }
 
-void deactivate_traps_vhe_put(void)
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu)
 {
-       u64 mdcr_el2 = read_sysreg(mdcr_el2);
-
-       mdcr_el2 &= MDCR_EL2_HPMN_MASK |
-                   MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
-                   MDCR_EL2_TPMS;
-
-       write_sysreg(mdcr_el2, mdcr_el2);
-
-       __deactivate_traps_common();
+       __deactivate_traps_common(vcpu);
 }
 
 /* Switch to the guest for VHE systems running in EL2 */
@@ -124,11 +116,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
         *
         * We have already configured the guest's stage 1 translation in
         * kvm_vcpu_load_sysregs_vhe above.  We must now call
-        * __load_guest_stage2 before __activate_traps, because
-        * __load_guest_stage2 configures stage 2 translation, and
+        * __load_stage2 before __activate_traps, because
+        * __load_stage2 configures stage 2 translation, and
         * __activate_traps clear HCR_EL2.TGE (among other things).
         */
-       __load_guest_stage2(vcpu->arch.hw_mmu);
+       __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
        __activate_traps(vcpu);
 
        __kvm_adjust_pc(vcpu);
index 2a0b8c8..007a12d 100644 (file)
@@ -101,7 +101,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
        struct kvm_cpu_context *host_ctxt;
 
        host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
-       deactivate_traps_vhe_put();
+       deactivate_traps_vhe_put(vcpu);
 
        __sysreg_save_el1_state(guest_ctxt);
        __sysreg_save_user_state(guest_ctxt);
index 66f1734..24cef9b 100644 (file)
@@ -50,10 +50,10 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
         *
         * ARM erratum 1165522 requires some special handling (again),
         * as we need to make sure both stages of translation are in
-        * place before clearing TGE. __load_guest_stage2() already
+        * place before clearing TGE. __load_stage2() already
         * has an ISB in order to deal with this.
         */
-       __load_guest_stage2(mmu);
+       __load_stage2(mmu, mmu->arch);
        val = read_sysreg(hcr_el2);
        val &= ~HCR_TGE;
        write_sysreg(val, hcr_el2);
index 0625bf2..1a94a7c 100644 (file)
@@ -80,6 +80,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
  */
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+       ++kvm->stat.generic.remote_tlb_flush_requests;
        kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
 }
 
@@ -259,10 +260,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size,
 {
        int err;
 
-       if (!kvm_host_owns_hyp_mappings()) {
-               return kvm_call_hyp_nvhe(__pkvm_create_mappings,
-                                        start, size, phys, prot);
-       }
+       if (WARN_ON(!kvm_host_owns_hyp_mappings()))
+               return -EINVAL;
 
        mutex_lock(&kvm_hyp_pgd_mutex);
        err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
@@ -282,6 +281,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
        }
 }
 
+static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end)
+{
+       phys_addr_t addr;
+       int ret;
+
+       for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) {
+               ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp,
+                                       __phys_to_pfn(addr));
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 /**
  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
  * @from:      The virtual kernel start address of the range
@@ -302,6 +316,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
        if (is_kernel_in_hyp_mode())
                return 0;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               if (WARN_ON(prot != PAGE_HYP))
+                       return -EPERM;
+               return pkvm_share_hyp(kvm_kaddr_to_phys(from),
+                                     kvm_kaddr_to_phys(to));
+       }
+
        start = start & PAGE_MASK;
        end = PAGE_ALIGN(end);
 
@@ -433,6 +454,32 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
        return 0;
 }
 
+static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
+       /* We shouldn't need any other callback to walk the PT */
+       .phys_to_virt           = kvm_host_va,
+};
+
+static int get_user_mapping_size(struct kvm *kvm, u64 addr)
+{
+       struct kvm_pgtable pgt = {
+               .pgd            = (kvm_pte_t *)kvm->mm->pgd,
+               .ia_bits        = VA_BITS,
+               .start_level    = (KVM_PGTABLE_MAX_LEVELS -
+                                  CONFIG_PGTABLE_LEVELS),
+               .mm_ops         = &kvm_user_mm_ops,
+       };
+       kvm_pte_t pte = 0;      /* Keep GCC quiet... */
+       u32 level = ~0;
+       int ret;
+
+       ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
+       VM_BUG_ON(ret);
+       VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
+       VM_BUG_ON(!(pte & PTE_VALID));
+
+       return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
+}
+
 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
        .zalloc_page            = stage2_memcache_zalloc_page,
        .zalloc_pages_exact     = kvm_host_zalloc_pages_exact,
@@ -485,7 +532,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        mmu->arch = &kvm->arch;
        mmu->pgt = pgt;
        mmu->pgd_phys = __pa(pgt->pgd);
-       mmu->vmid.vmid_gen = 0;
+       WRITE_ONCE(mmu->vmid.vmid_gen, 0);
        return 0;
 
 out_destroy_pgtable:
@@ -780,7 +827,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
  * Returns the size of the mapping.
  */
 static unsigned long
-transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
+transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
                            unsigned long hva, kvm_pfn_t *pfnp,
                            phys_addr_t *ipap)
 {
@@ -791,8 +838,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
         * sure that the HVA and IPA are sufficiently aligned and that the
         * block map is contained within the memslot.
         */
-       if (kvm_is_transparent_hugepage(pfn) &&
-           fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+       if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+           get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
                /*
                 * The address we faulted on is backed by a transparent huge
                 * page.  However, because we map the compound huge page and
@@ -814,7 +861,7 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
                *ipap &= PMD_MASK;
                kvm_release_pfn_clean(pfn);
                pfn &= ~(PTRS_PER_PMD - 1);
-               kvm_get_pfn(pfn);
+               get_page(pfn_to_page(pfn));
                *pfnp = pfn;
 
                return PMD_SIZE;
@@ -1050,9 +1097,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * If we are not forced to use page mapping, check if we are
         * backed by a THP and thus use block mapping if possible.
         */
-       if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
-               vma_pagesize = transparent_hugepage_adjust(memslot, hva,
-                                                          &pfn, &fault_ipa);
+       if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+               if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
+                       vma_pagesize = fault_granule;
+               else
+                       vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
+                                                                  hva, &pfn,
+                                                                  &fault_ipa);
+       }
 
        if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
                /* Check the VMM hasn't introduced a new VM_SHARED VMA */
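
With __pkvm_create_mappings gone, pkvm_share_hyp() shares memory with the
hypervisor one page at a time: align the start down, issue one per-pfn hypercall
per page, and stop at the first failure. A minimal sketch of that loop, where
share_pfn() is a placeholder for the __pkvm_host_share_hyp hypercall.

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT  12
    #define PAGE_SIZE   (1ULL << PAGE_SHIFT)
    #define ALIGN_DOWN(x, a)  ((x) & ~((uint64_t)(a) - 1))

    static int pages_shared;

    static int share_pfn(uint64_t pfn)          /* placeholder for the hypercall */
    {
            (void)pfn;
            pages_shared++;
            return 0;
    }

    static int share_range(uint64_t start, uint64_t end)
    {
            uint64_t addr;
            int ret;

            for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) {
                    ret = share_pfn(addr >> PAGE_SHIFT);
                    if (ret)
                            return ret;         /* stop at the first failure */
            }
            return 0;
    }

    int main(void)
    {
            assert(share_range(0x1080, 0x3000) == 0);  /* pages at 0x1000, 0x2000 */
            assert(pages_shared == 2);
            return 0;
    }
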
index 151c31f..f9bb3b1 100644 (file)
@@ -50,7 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 int kvm_perf_init(void)
 {
-       if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled())
+       if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled())
                static_branch_enable(&kvm_arm_pmu_available);
 
        return perf_register_guest_info_callbacks(&kvm_guest_cbs);
index f33825c..f5065f2 100644 (file)
@@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
                reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
                reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
                reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
-               reg &= kvm_pmu_valid_counter_mask(vcpu);
        }
 
        return reg;
@@ -564,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
  */
 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
 {
-       unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
        int i;
 
        if (val & ARMV8_PMU_PMCR_E) {
                kvm_pmu_enable_counter_mask(vcpu,
-                      __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+                      __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
        } else {
-               kvm_pmu_disable_counter_mask(vcpu, mask);
+               kvm_pmu_disable_counter_mask(vcpu,
+                      __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
        }
 
        if (val & ARMV8_PMU_PMCR_C)
                kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
 
        if (val & ARMV8_PMU_PMCR_P) {
+               unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
                mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
                for_each_set_bit(i, &mask, 32)
                        kvm_pmu_set_counter_value(vcpu, i, 0);
@@ -745,7 +745,7 @@ int kvm_pmu_probe_pmuver(void)
        struct perf_event_attr attr = { };
        struct perf_event *event;
        struct arm_pmu *pmu;
-       int pmuver = 0xf;
+       int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF;
 
        /*
         * Create a dummy event that only counts user cycles. As we'll never
@@ -770,7 +770,7 @@ int kvm_pmu_probe_pmuver(void)
        if (IS_ERR(event)) {
                pr_err_once("kvm: pmu event creation failed %ld\n",
                            PTR_ERR(event));
-               return 0xf;
+               return ID_AA64DFR0_PMUVER_IMP_DEF;
        }
 
        if (event->pmu) {
@@ -923,7 +923,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
        if (!vcpu->kvm->arch.pmuver)
                vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
 
-       if (vcpu->kvm->arch.pmuver == 0xf)
+       if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF)
                return -ENODEV;
 
        switch (attr->attr) {
index db4056e..74c47d4 100644 (file)
@@ -59,6 +59,12 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
        kvm_vcpu_kick(vcpu);
 }
 
+static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu,
+                                          unsigned long affinity)
+{
+       return !(affinity & ~MPIDR_HWID_BITMASK);
+}
+
 static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 {
        struct vcpu_reset_state *reset_state;
@@ -66,9 +72,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
        struct kvm_vcpu *vcpu = NULL;
        unsigned long cpu_id;
 
-       cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
-       if (vcpu_mode_is_32bit(source_vcpu))
-               cpu_id &= ~((u32) 0);
+       cpu_id = smccc_get_arg1(source_vcpu);
+       if (!kvm_psci_valid_affinity(source_vcpu, cpu_id))
+               return PSCI_RET_INVALID_PARAMS;
 
        vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
 
@@ -126,6 +132,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
        target_affinity = smccc_get_arg1(vcpu);
        lowest_affinity_level = smccc_get_arg2(vcpu);
 
+       if (!kvm_psci_valid_affinity(vcpu, target_affinity))
+               return PSCI_RET_INVALID_PARAMS;
+
        /* Determine target affinity mask */
        target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
        if (!target_affinity_mask)
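
kvm_psci_valid_affinity() tightens CPU_ON and AFFINITY_INFO handling: instead of
silently masking the argument, any bit set outside the MPIDR affinity fields now
fails with INVALID_PARAMS. A tiny model of the check; the mask mirrors
MPIDR_HWID_BITMASK (Aff3 in bits [39:32], Aff2..Aff0 in bits [23:0]).

    #include <assert.h>
    #include <stdint.h>

    #define HWID_BITMASK  0xff00ffffffULL        /* Aff3 | Aff2 | Aff1 | Aff0 */

    static int valid_affinity(uint64_t affinity)
    {
            /* Same test as kvm_psci_valid_affinity(): no bits outside the mask. */
            return !(affinity & ~HWID_BITMASK);
    }

    int main(void)
    {
            assert(valid_affinity(0x3));               /* Aff0 = 3             */
            assert(valid_affinity(0x0100000000ULL));   /* Aff3 = 1             */
            assert(!valid_affinity(1ULL << 31));       /* bit in the RES0 hole */
            assert(!valid_affinity(1ULL << 63));       /* garbage high bit     */
            return 0;
    }
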
index cba7872..5ce36b0 100644 (file)
@@ -210,10 +210,16 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu)
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_reset_state reset_state;
        int ret;
        bool loaded;
        u32 pstate;
 
+       mutex_lock(&vcpu->kvm->lock);
+       reset_state = vcpu->arch.reset_state;
+       WRITE_ONCE(vcpu->arch.reset_state.reset, false);
+       mutex_unlock(&vcpu->kvm->lock);
+
        /* Reset PMU outside of the non-preemptible section */
        kvm_pmu_vcpu_reset(vcpu);
 
@@ -276,8 +282,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
         * Additional reset state handling that PSCI may have imposed on us.
         * Must be done after all the sys_reg reset.
         */
-       if (vcpu->arch.reset_state.reset) {
-               unsigned long target_pc = vcpu->arch.reset_state.pc;
+       if (reset_state.reset) {
+               unsigned long target_pc = reset_state.pc;
 
                /* Gracefully handle Thumb2 entry point */
                if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
@@ -286,13 +292,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
                }
 
                /* Propagate caller endianness */
-               if (vcpu->arch.reset_state.be)
+               if (reset_state.be)
                        kvm_vcpu_set_be(vcpu);
 
                *vcpu_pc(vcpu) = target_pc;
-               vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0);
-
-               vcpu->arch.reset_state.reset = false;
+               vcpu_set_reg(vcpu, 0, reset_state.r0);
        }
 
        /* Reset timer */
@@ -311,31 +315,26 @@ u32 get_kvm_ipa_limit(void)
 
 int kvm_set_ipa_limit(void)
 {
-       unsigned int parange, tgran_2;
+       unsigned int parange;
        u64 mmfr0;
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        parange = cpuid_feature_extract_unsigned_field(mmfr0,
                                ID_AA64MMFR0_PARANGE_SHIFT);
+       /*
+        * An IPA size beyond 48 bits cannot be supported
+        * with either 4K or 16K page sizes, so cap it
+        * at 48 bits in case the system reports a
+        * larger value.
+        */
+       if (PAGE_SIZE != SZ_64K)
+               parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48);
 
        /*
         * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at
         * Stage-2. If not, things will stop very quickly.
         */
-       switch (PAGE_SIZE) {
-       default:
-       case SZ_4K:
-               tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT;
-               break;
-       case SZ_16K:
-               tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT;
-               break;
-       case SZ_64K:
-               tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT;
-               break;
-       }
-
-       switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) {
+       switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) {
        case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE:
                kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n");
                return -EINVAL;
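
The per-page-size switch can go away because ID_AA64MMFR0_TGRAN_2_SHIFT is now a compile-time alias selected by the kernel's page size in the arm64 sysreg headers (added elsewhere in this series). The mapping is along these lines; the exact header contents are an assumption here:

/* Sketch: pick the Stage-2 granule field matching PAGE_SIZE. */
#if defined(CONFIG_ARM64_4K_PAGES)
#define ID_AA64MMFR0_TGRAN_2_SHIFT	ID_AA64MMFR0_TGRAN4_2_SHIFT
#elif defined(CONFIG_ARM64_16K_PAGES)
#define ID_AA64MMFR0_TGRAN_2_SHIFT	ID_AA64MMFR0_TGRAN16_2_SHIFT
#elif defined(CONFIG_ARM64_64K_PAGES)
#define ID_AA64MMFR0_TGRAN_2_SHIFT	ID_AA64MMFR0_TGRAN64_2_SHIFT
#endif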
@@ -369,7 +368,7 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
        phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
        if (phys_shift) {
                if (phys_shift > kvm_ipa_limit ||
-                   phys_shift < 32)
+                   phys_shift < ARM64_MIN_PARANGE_BITS)
                        return -EINVAL;
        } else {
                phys_shift = KVM_PHYS_SHIFT;
index f6f126e..1d46e18 100644 (file)
  * 64bit interface.
  */
 
-#define reg_to_encoding(x)                                             \
-       sys_reg((u32)(x)->Op0, (u32)(x)->Op1,                           \
-               (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)
-
 static bool read_from_write_only(struct kvm_vcpu *vcpu,
                                 struct sys_reg_params *params,
                                 const struct sys_reg_desc *r)
@@ -318,14 +314,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu,
 /*
  * We want to avoid world-switching all the DBG registers all the
  * time:
- * 
+ *
  * - If we've touched any debug register, it is likely that we're
  *   going to touch more of them. It then makes sense to disable the
  *   traps and start doing the save/restore dance
  * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is
  *   then mandatory to save/restore the registers, as the guest
  *   depends on them.
- * 
+ *
  * For this, we use a DIRTY bit, indicating the guest has modified the
  * debug registers, used as follows:
  *
@@ -603,6 +599,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
        return REG_HIDDEN;
 }
 
+static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+       u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
+
+       /* No PMU available, any PMU reg may UNDEF... */
+       if (!kvm_arm_support_pmu_v3())
+               return;
+
+       n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
+       n &= ARMV8_PMU_PMCR_N_MASK;
+       if (n)
+               mask |= GENMASK(n - 1, 0);
+
+       reset_unknown(vcpu, r);
+       __vcpu_sys_reg(vcpu, r->reg) &= mask;
+}
+
+static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+       reset_unknown(vcpu, r);
+       __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+}
+
+static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+       reset_unknown(vcpu, r);
+       __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
+}
+
+static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+       reset_unknown(vcpu, r);
+       __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+}
+
 static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
        u64 pmcr, val;
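
reset_pmu_reg() above builds a mask of counters that actually exist from the host's PMCR_EL0.N field, so the UNKNOWN reset value can never set an enable/overflow bit for a counter the hardware lacks. A worked example with a hypothetical N:

/* Hypothetical host with PMCR_EL0.N = 6 event counters:
 *   mask = BIT(ARMV8_PMU_CYCLE_IDX) | GENMASK(5, 0)
 *        = bit 31 (cycle counter) + bits 5..0 (event counters 0-5),
 * so bits 6..30 stay clear after reset_unknown().
 */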
@@ -845,7 +876,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                        kvm_pmu_disable_counter_mask(vcpu, val);
                }
        } else {
-               p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
+               p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
        }
 
        return true;
@@ -869,7 +900,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                        /* accessing PMINTENCLR_EL1 */
                        __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
        } else {
-               p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
+               p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
        }
 
        return true;
@@ -891,7 +922,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                        /* accessing PMOVSCLR_EL0 */
                        __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
        } else {
-               p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
+               p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
        }
 
        return true;
@@ -944,16 +975,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
          trap_wcr, reset_wcr, 0, 0,  get_wcr, set_wcr }
 
 #define PMU_SYS_REG(r)                                         \
-       SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility
+       SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility
 
 /* Macro to expand the PMEVCNTRn_EL0 register */
 #define PMU_PMEVCNTR_EL0(n)                                            \
        { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)),                            \
+         .reset = reset_pmevcntr,                                      \
          .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
 
 /* Macro to expand the PMEVTYPERn_EL0 register */
 #define PMU_PMEVTYPER_EL0(n)                                           \
        { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)),                           \
+         .reset = reset_pmevtyper,                                     \
          .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
 
 static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
@@ -1026,8 +1059,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
        return true;
 }
 
-#define FEATURE(x)     (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT))
-
 /* Read a sanitised cpufeature ID register by sys_reg_desc */
 static u64 read_id_reg(const struct kvm_vcpu *vcpu,
                struct sys_reg_desc const *r, bool raz)
@@ -1038,40 +1069,40 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
        switch (id) {
        case SYS_ID_AA64PFR0_EL1:
                if (!vcpu_has_sve(vcpu))
-                       val &= ~FEATURE(ID_AA64PFR0_SVE);
-               val &= ~FEATURE(ID_AA64PFR0_AMU);
-               val &= ~FEATURE(ID_AA64PFR0_CSV2);
-               val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
-               val &= ~FEATURE(ID_AA64PFR0_CSV3);
-               val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
+                       val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE);
+               val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU);
+               val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2);
+               val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
+               val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3);
+               val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
                break;
        case SYS_ID_AA64PFR1_EL1:
-               val &= ~FEATURE(ID_AA64PFR1_MTE);
+               val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE);
                if (kvm_has_mte(vcpu->kvm)) {
                        u64 pfr, mte;
 
                        pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
                        mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
-                       val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
+                       val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte);
                }
                break;
        case SYS_ID_AA64ISAR1_EL1:
                if (!vcpu_has_ptrauth(vcpu))
-                       val &= ~(FEATURE(ID_AA64ISAR1_APA) |
-                                FEATURE(ID_AA64ISAR1_API) |
-                                FEATURE(ID_AA64ISAR1_GPA) |
-                                FEATURE(ID_AA64ISAR1_GPI));
+                       val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) |
+                                ARM64_FEATURE_MASK(ID_AA64ISAR1_API) |
+                                ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) |
+                                ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI));
                break;
        case SYS_ID_AA64DFR0_EL1:
                /* Limit debug to ARMv8.0 */
-               val &= ~FEATURE(ID_AA64DFR0_DEBUGVER);
-               val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6);
+               val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER);
+               val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6);
                /* Limit guests to PMUv3 for ARMv8.4 */
                val = cpuid_feature_cap_perfmon_field(val,
                                                      ID_AA64DFR0_PMUVER_SHIFT,
                                                      kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
                /* Hide SPE from guests */
-               val &= ~FEATURE(ID_AA64DFR0_PMSVER);
+               val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER);
                break;
        case SYS_ID_DFR0_EL1:
                /* Limit guests to PMUv3 for ARMv8.4 */
@@ -1249,6 +1280,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
        return __set_id_reg(vcpu, rd, uaddr, true);
 }
 
+static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                     const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       int err;
+       u64 val;
+
+       /* Perform the access even if we are going to ignore the value */
+       err = reg_from_user(&val, uaddr, sys_reg_to_index(rd));
+       if (err)
+               return err;
+
+       return 0;
+}
+
 static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                       const struct sys_reg_desc *r)
 {
@@ -1592,16 +1637,21 @@ static const struct sys_reg_desc sys_reg_descs[] = {
          .access = access_pmcnten, .reg = PMCNTENSET_EL0 },
        { PMU_SYS_REG(SYS_PMOVSCLR_EL0),
          .access = access_pmovs, .reg = PMOVSSET_EL0 },
+       /*
+        * PMSWINC_EL0 is exposed to userspace as RAZ/WI, as it was
+        * previously (and pointlessly) advertised...
+        */
        { PMU_SYS_REG(SYS_PMSWINC_EL0),
-         .access = access_pmswinc, .reg = PMSWINC_EL0 },
+         .get_user = get_raz_id_reg, .set_user = set_wi_reg,
+         .access = access_pmswinc, .reset = NULL },
        { PMU_SYS_REG(SYS_PMSELR_EL0),
-         .access = access_pmselr, .reg = PMSELR_EL0 },
+         .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
        { PMU_SYS_REG(SYS_PMCEID0_EL0),
          .access = access_pmceid, .reset = NULL },
        { PMU_SYS_REG(SYS_PMCEID1_EL0),
          .access = access_pmceid, .reset = NULL },
        { PMU_SYS_REG(SYS_PMCCNTR_EL0),
-         .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 },
+         .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 },
        { PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
          .access = access_pmu_evtyper, .reset = NULL },
        { PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
@@ -2106,23 +2156,6 @@ static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
        return 0;
 }
 
-static int match_sys_reg(const void *key, const void *elt)
-{
-       const unsigned long pval = (unsigned long)key;
-       const struct sys_reg_desc *r = elt;
-
-       return pval - reg_to_encoding(r);
-}
-
-static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
-                                        const struct sys_reg_desc table[],
-                                        unsigned int num)
-{
-       unsigned long pval = reg_to_encoding(params);
-
-       return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
-}
-
 int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
 {
        kvm_inject_undefined(vcpu);
@@ -2365,13 +2398,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
 
        trace_kvm_handle_sys_reg(esr);
 
-       params.Op0 = (esr >> 20) & 3;
-       params.Op1 = (esr >> 14) & 0x7;
-       params.CRn = (esr >> 10) & 0xf;
-       params.CRm = (esr >> 1) & 0xf;
-       params.Op2 = (esr >> 17) & 0x7;
+       params = esr_sys64_to_params(esr);
        params.regval = vcpu_get_reg(vcpu, Rt);
-       params.is_write = !(esr & 1);
 
        ret = emulate_sys_reg(vcpu, &params);
 
index 9d06214..cc0cc95 100644 (file)
 #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__
 #define __ARM64_KVM_SYS_REGS_LOCAL_H__
 
+#include <linux/bsearch.h>
+
+#define reg_to_encoding(x)                                             \
+       sys_reg((u32)(x)->Op0, (u32)(x)->Op1,                           \
+               (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)
+
 struct sys_reg_params {
        u8      Op0;
        u8      Op1;
@@ -21,6 +27,14 @@ struct sys_reg_params {
        bool    is_write;
 };
 
+#define esr_sys64_to_params(esr)                                               \
+       ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3,                    \
+                                 .Op1 = ((esr) >> 14) & 0x7,                  \
+                                 .CRn = ((esr) >> 10) & 0xf,                  \
+                                 .CRm = ((esr) >> 1) & 0xf,                   \
+                                 .Op2 = ((esr) >> 17) & 0x7,                  \
+                                 .is_write = !((esr) & 1) })
+
 struct sys_reg_desc {
        /* Sysreg string for debug */
        const char *name;
@@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
        return i1->Op2 - i2->Op2;
 }
 
+static inline int match_sys_reg(const void *key, const void *elt)
+{
+       const unsigned long pval = (unsigned long)key;
+       const struct sys_reg_desc *r = elt;
+
+       return pval - reg_to_encoding(r);
+}
+
+static inline const struct sys_reg_desc *
+find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[],
+        unsigned int num)
+{
+       unsigned long pval = reg_to_encoding(params);
+
+       return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
+}
+
 const struct sys_reg_desc *find_reg_by_id(u64 id,
                                          struct sys_reg_params *params,
                                          const struct sys_reg_desc table[],
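
Moving the decode and search helpers into the header lets any trap path turn an ESR value into a descriptor lookup in a couple of lines. A minimal usage sketch (error handling reduced to the common inject-UNDEF case; the table here is the main sys_reg_descs[] array, but any descriptor table works):

/* Sketch: decode a trapped MSR/MRS access and find its descriptor. */
struct sys_reg_params params = esr_sys64_to_params(esr);
const struct sys_reg_desc *r;

r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
if (!r)
	kvm_inject_undefined(vcpu);	/* unknown encoding */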
index 8d78acc..064a58c 100644 (file)
@@ -78,13 +78,17 @@ TRACE_EVENT(kvm_arm_clear_debug,
        TP_printk("flags: 0x%08x", __entry->guest_debug)
 );
 
+/*
+ * The dreg32 name is a leftover from a distant past. This will really
+ * output a 64-bit value...
+ */
 TRACE_EVENT(kvm_arm_set_dreg32,
-       TP_PROTO(const char *name, __u32 value),
+       TP_PROTO(const char *name, __u64 value),
        TP_ARGS(name, value),
 
        TP_STRUCT__entry(
                __field(const char *, name)
-               __field(__u32, value)
+               __field(__u64, value)
        ),
 
        TP_fast_assign(
@@ -92,7 +96,7 @@ TRACE_EVENT(kvm_arm_set_dreg32,
                __entry->value = value;
        ),
 
-       TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+       TP_printk("%s: 0x%llx", __entry->name, __entry->value)
 );
 
 TRACE_DEFINE_SIZEOF(__u64);
index a016f07..5f9014a 100644 (file)
@@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
        case GIC_CPU_PRIMASK:
                /*
                 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-                * the PMR field as GICH_VMCR.VMPriMask rather than
+                * PMR field as GICH_VMCR.VMPriMask rather than
                 * GICC_PMR.Priority, so we expose the upper five bits of
                 * priority mask to userspace using the lower bits in the
                 * unsigned long.
@@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
        case GIC_CPU_PRIMASK:
                /*
                 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-                * the PMR field as GICH_VMCR.VMPriMask rather than
+                * PMR field as GICH_VMCR.VMPriMask rather than
                 * GICC_PMR.Priority, so we expose the upper five bits of
                 * priority mask to userspace using the lower bits in the
                 * unsigned long.
index 2c58020..95a18ce 100644 (file)
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
                u32 val = cpuif->vgic_lr[lr];
                u32 cpuid, intid = val & GICH_LR_VIRTUALID;
                struct vgic_irq *irq;
+               bool deactivated;
 
                /* Extract the source vCPU id from the LR */
                cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 
                raw_spin_lock(&irq->irq_lock);
 
-               /* Always preserve the active bit */
+               /* Always preserve the active bit, note deactivation */
+               deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
                irq->active = !!(val & GICH_LR_ACTIVE_BIT);
 
                if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
                if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
                        irq->pending_latch = false;
 
-               /*
-                * Level-triggered mapped IRQs are special because we only
-                * observe rising edges as input to the VGIC.
-                *
-                * If the guest never acked the interrupt we have to sample
-                * the physical line and set the line level, because the
-                * device state could have changed or we simply need to
-                * process the still pending interrupt later.
-                *
-                * If this causes us to lower the level, we have to also clear
-                * the physical active state, since we will otherwise never be
-                * told when the interrupt becomes asserted again.
-                *
-                * Another case is when the interrupt requires a helping hand
-                * on deactivation (no HW deactivation, for example).
-                */
-               if (vgic_irq_is_mapped_level(irq)) {
-                       bool resample = false;
-
-                       if (val & GICH_LR_PENDING_BIT) {
-                               irq->line_level = vgic_get_phys_line_level(irq);
-                               resample = !irq->line_level;
-                       } else if (vgic_irq_needs_resampling(irq) &&
-                                  !(irq->active || irq->pending_latch)) {
-                               resample = true;
-                       }
-
-                       if (resample)
-                               vgic_irq_set_phys_active(irq, false);
-               }
+               /* Handle resampling for mapped interrupts if required */
+               vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
 
                raw_spin_unlock(&irq->irq_lock);
                vgic_put_irq(vcpu->kvm, irq);
index 66004f6..21a6207 100644 (file)
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
                u32 intid, cpuid;
                struct vgic_irq *irq;
                bool is_v2_sgi = false;
+               bool deactivated;
 
                cpuid = val & GICH_LR_PHYSID_CPUID;
                cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 
                raw_spin_lock(&irq->irq_lock);
 
-               /* Always preserve the active bit */
+               /* Always preserve the active bit, note deactivation */
+               deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
                irq->active = !!(val & ICH_LR_ACTIVE_BIT);
 
                if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
                if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
                        irq->pending_latch = false;
 
-               /*
-                * Level-triggered mapped IRQs are special because we only
-                * observe rising edges as input to the VGIC.
-                *
-                * If the guest never acked the interrupt we have to sample
-                * the physical line and set the line level, because the
-                * device state could have changed or we simply need to
-                * process the still pending interrupt later.
-                *
-                * If this causes us to lower the level, we have to also clear
-                * the physical active state, since we will otherwise never be
-                * told when the interrupt becomes asserted again.
-                *
-                * Another case is when the interrupt requires a helping hand
-                * on deactivation (no HW deactivation, for example).
-                */
-               if (vgic_irq_is_mapped_level(irq)) {
-                       bool resample = false;
-
-                       if (val & ICH_LR_PENDING_BIT) {
-                               irq->line_level = vgic_get_phys_line_level(irq);
-                               resample = !irq->line_level;
-                       } else if (vgic_irq_needs_resampling(irq) &&
-                                  !(irq->active || irq->pending_latch)) {
-                               resample = true;
-                       }
-
-                       if (resample)
-                               vgic_irq_set_phys_active(irq, false);
-               }
+               /* Handle resampling for mapped interrupts if required */
+               vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
 
                raw_spin_unlock(&irq->irq_lock);
                vgic_put_irq(vcpu->kvm, irq);
index 111bff4..5dad499 100644 (file)
@@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
        if (intid >= VGIC_MIN_LPI)
                return vgic_get_lpi(kvm, intid);
 
-       WARN(1, "Looking up struct vgic_irq for reserved INTID");
        return NULL;
 }
 
@@ -1022,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
 
        return map_is_active;
 }
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+                               bool lr_deactivated, bool lr_pending)
+{
+       if (vgic_irq_is_mapped_level(irq)) {
+               bool resample = false;
+
+               if (unlikely(vgic_irq_needs_resampling(irq))) {
+                       resample = !(irq->active || irq->pending_latch);
+               } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+                       irq->line_level = vgic_get_phys_line_level(irq);
+                       resample = !irq->line_level;
+               }
+
+               if (resample)
+                       vgic_irq_set_phys_active(irq, false);
+       }
+}
index dc1f3d1..14a9218 100644 (file)
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
                           unsigned long flags);
 void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+                               bool lr_deactivated, bool lr_pending);
 
 int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
                      phys_addr_t addr, phys_addr_t alignment);
index af9dd02..75c6f26 100644 (file)
@@ -41,8 +41,6 @@
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        KVM_GENERIC_VM_STATS()
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits),
 #endif
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
index 43cad10..4adca5a 100644 (file)
@@ -388,7 +388,6 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu,
                                   u32 compare, u32 cause)
 {
        u32 start_count, after_count;
-       ktime_t freeze_time;
        unsigned long flags;
 
        /*
@@ -396,7 +395,7 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu,
         * this with interrupts disabled to avoid latency.
         */
        local_irq_save(flags);
-       freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count);
+       kvm_mips_freeze_hrtimer(vcpu, &start_count);
        write_c0_gtoffset(start_count - read_c0_count());
        local_irq_restore(flags);
 
index a779f78..080a7fe 100644 (file)
@@ -103,7 +103,6 @@ struct kvm_vcpu_stat {
        u64 emulated_inst_exits;
        u64 dec_exits;
        u64 ext_intr_exits;
-       u64 halt_wait_ns;
        u64 halt_successful_wait;
        u64 dbell_exits;
        u64 gdbell_exits;
index 79833f7..b785f67 100644 (file)
@@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        STATS_DESC_ICOUNTER(VM, num_2M_pages),
        STATS_DESC_ICOUNTER(VM, num_1G_pages)
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -71,7 +69,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
        STATS_DESC_COUNTER(VCPU, dec_exits),
        STATS_DESC_COUNTER(VCPU, ext_intr_exits),
-       STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
        STATS_DESC_COUNTER(VCPU, halt_successful_wait),
        STATS_DESC_COUNTER(VCPU, dbell_exits),
        STATS_DESC_COUNTER(VCPU, gdbell_exits),
@@ -88,8 +85,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, pthru_host),
        STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
index 8da93fd..6365087 100644 (file)
@@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
        unsigned long gfn = tce >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
 
-       memslot = search_memslots(kvm_memslots(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
        if (!memslot)
                return -EINVAL;
 
index 636c6ae..870b7f0 100644 (file)
@@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm,
        unsigned long gfn = tce >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
 
-       memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        if (!memslot)
                return -EINVAL;
 
index bb0dacf..2acb1c9 100644 (file)
@@ -4202,19 +4202,31 @@ out:
 
        /* Attribute wait time */
        if (do_sleep) {
-               vc->runner->stat.halt_wait_ns +=
+               vc->runner->stat.generic.halt_wait_ns +=
                        ktime_to_ns(cur) - ktime_to_ns(start_wait);
+               KVM_STATS_LOG_HIST_UPDATE(
+                               vc->runner->stat.generic.halt_wait_hist,
+                               ktime_to_ns(cur) - ktime_to_ns(start_wait));
                /* Attribute failed poll time */
-               if (vc->halt_poll_ns)
+               if (vc->halt_poll_ns) {
                        vc->runner->stat.generic.halt_poll_fail_ns +=
                                ktime_to_ns(start_wait) -
                                ktime_to_ns(start_poll);
+                       KVM_STATS_LOG_HIST_UPDATE(
+                               vc->runner->stat.generic.halt_poll_fail_hist,
+                               ktime_to_ns(start_wait) -
+                               ktime_to_ns(start_poll));
+               }
        } else {
                /* Attribute successful poll time */
-               if (vc->halt_poll_ns)
+               if (vc->halt_poll_ns) {
                        vc->runner->stat.generic.halt_poll_success_ns +=
                                ktime_to_ns(cur) -
                                ktime_to_ns(start_poll);
+                       KVM_STATS_LOG_HIST_UPDATE(
+                               vc->runner->stat.generic.halt_poll_success_hist,
+                               ktime_to_ns(cur) - ktime_to_ns(start_poll));
+               }
        }
 
        /* Adjust poll time */
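
KVM_STATS_LOG_HIST_UPDATE() comes from the generic statistics rework in this series; the hunk above only adds call sites. The idea is logarithmic bucketing of a nanosecond duration; a sketch of that bucketing, assuming the real macro clamps to the histogram size:

/* Sketch: increment the log2 bucket for `value', clamping to the last one. */
#define LOG_HIST_UPDATE_SKETCH(array, value)			\
do {								\
	size_t bucket = fls64(value);				\
	if (bucket >= ARRAY_SIZE(array))			\
		bucket = ARRAY_SIZE(array) - 1;			\
	(array)[bucket]++;					\
} while (0)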
index 551b30d..977801c 100644 (file)
@@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        STATS_DESC_ICOUNTER(VM, num_2M_pages),
        STATS_DESC_ICOUNTER(VM, num_1G_pages)
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -69,7 +67,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
        STATS_DESC_COUNTER(VCPU, dec_exits),
        STATS_DESC_COUNTER(VCPU, ext_intr_exits),
-       STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
        STATS_DESC_COUNTER(VCPU, halt_successful_wait),
        STATS_DESC_COUNTER(VCPU, dbell_exits),
        STATS_DESC_COUNTER(VCPU, gdbell_exits),
@@ -79,8 +76,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, pthru_host),
        STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
index d681ae4..a604d51 100644 (file)
@@ -244,6 +244,7 @@ struct kvm_s390_sie_block {
        __u8    fpf;                    /* 0x0060 */
 #define ECB_GS         0x40
 #define ECB_TE         0x10
+#define ECB_SPECI      0x08
 #define ECB_SRSI       0x04
 #define ECB_HOSTPROTINT        0x02
        __u8    ecb;                    /* 0x0061 */
@@ -955,6 +956,7 @@ struct kvm_arch{
        atomic64_t cmma_dirty_pages;
        /* subset of available cpu features enabled by user space */
        DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+       /* indexed by vcpu_idx */
        DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
        struct kvm_s390_gisa_interrupt gisa_int;
        struct kvm_s390_pv pv;
index d548d60..16256e1 100644 (file)
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
        kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
-       set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+       set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
 }
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
        kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
-       clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+       clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
 }
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
 
 static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
 {
-       int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+       int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
        struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
        struct kvm_vcpu *vcpu;
 
-       for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
-               vcpu = kvm_get_vcpu(kvm, vcpu_id);
+       for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+               vcpu = kvm_get_vcpu(kvm, vcpu_idx);
                if (psw_ioint_disabled(vcpu))
                        continue;
                deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
                if (deliverable_mask) {
                        /* lately kicked but not yet running */
-                       if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+                       if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
                                return;
                        kvm_s390_vcpu_wakeup(vcpu);
                        return;
index efda061..752a0ff 100644 (file)
@@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        STATS_DESC_COUNTER(VM, inject_service_signal),
        STATS_DESC_COUNTER(VM, inject_virtio)
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
        STATS_DESC_COUNTER(VCPU, pfault_sync)
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -1953,7 +1949,7 @@ out:
 static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
 {
        int start = 0, end = slots->used_slots;
-       int slot = atomic_read(&slots->lru_slot);
+       int slot = atomic_read(&slots->last_used_slot);
        struct kvm_memory_slot *memslots = slots->memslots;
 
        if (gfn >= memslots[slot].base_gfn &&
@@ -1974,7 +1970,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
 
        if (gfn >= memslots[start].base_gfn &&
            gfn < memslots[start].base_gfn + memslots[start].npages) {
-               atomic_set(&slots->lru_slot, start);
+               atomic_set(&slots->last_used_slot, start);
        }
 
        return start;
@@ -3224,6 +3220,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
                vcpu->arch.sie_block->ecb |= ECB_SRSI;
        if (test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= ECB_TE;
+       if (!kvm_is_ucontrol(vcpu->kvm))
+               vcpu->arch.sie_block->ecb |= ECB_SPECI;
 
        if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
                vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
@@ -4068,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
                kvm_s390_patch_guest_per_regs(vcpu);
        }
 
-       clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+       clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
 
        vcpu->arch.sie_block->icptcode = 0;
        cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
index 9fad251..ecd741e 100644 (file)
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-       return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+       return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
index 4002a24..acda4b6 100644 (file)
@@ -510,6 +510,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                        prefix_unmapped(vsie_page);
                scb_s->ecb |= ECB_TE;
        }
+       /* specification exception interpretation */
+       scb_s->ecb |= scb_o->ecb & ECB_SPECI;
        /* branch prediction */
        if (test_kvm_facility(vcpu->kvm, 82))
                scb_s->fpf |= scb_o->fpf & FPF_BPBC;
index a12a498..cefe1d8 100644 (file)
@@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window)
 KVM_X86_OP(enable_irq_window)
 KVM_X86_OP(update_cr8_intercept)
 KVM_X86_OP(check_apicv_inhibit_reasons)
-KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
 KVM_X86_OP(refresh_apicv_exec_ctrl)
 KVM_X86_OP(hwapic_irr_update)
 KVM_X86_OP(hwapic_isr_update)
index af6ce8d..f8f48a7 100644 (file)
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
-#define KVM_MAX_VCPUS 288
-#define KVM_SOFT_MAX_VCPUS 240
-#define KVM_MAX_VCPU_ID 1023
+#define KVM_MAX_VCPUS 1024
+#define KVM_SOFT_MAX_VCPUS 710
+
+/*
+ * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
+ * might be larger than the actual number of VCPUs because the
+ * APIC ID encodes CPU topology information.
+ *
+ * In the worst case, we'll need less than one extra bit for the
+ * Core ID, and less than one extra bit for the Package (Die) ID,
+ * so a ratio of 4 should be enough.
+ */
+#define KVM_VCPU_ID_RATIO 4
+#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
+
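
As a quick check of the arithmetic above: 1024 vCPUs need 10 bits of APIC ID, and the ratio of 4 leaves one extra bit for the core part and one for the package/die part of the topology encoding, i.e. a 4096-entry ID space. Purely illustrative:

/* Illustrative only: 1024 * 4 == 4096 APIC IDs. */
static_assert(KVM_MAX_VCPU_ID == 4 * 1024);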
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3
 
 #define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
-static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
-{
-       /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
-       return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-}
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64UL
 #define KVM_MMU_HASH_SHIFT 12
@@ -229,7 +234,8 @@ enum x86_intercept_stage;
        KVM_GUESTDBG_USE_HW_BP | \
        KVM_GUESTDBG_USE_SW_BP | \
        KVM_GUESTDBG_INJECT_BP | \
-       KVM_GUESTDBG_INJECT_DB)
+       KVM_GUESTDBG_INJECT_DB | \
+       KVM_GUESTDBG_BLOCKIRQ)
 
 
 #define PFERR_PRESENT_BIT 0
@@ -447,6 +453,7 @@ struct kvm_mmu {
 
        u64 *pae_root;
        u64 *pml4_root;
+       u64 *pml5_root;
 
        /*
         * check zero bits on shadow page table entries, these
@@ -482,6 +489,7 @@ struct kvm_pmc {
         * ctrl value for fixed counters.
         */
        u64 current_config;
+       bool is_paused;
 };
 
 struct kvm_pmu {
@@ -522,7 +530,6 @@ struct kvm_pmu_ops;
 enum {
        KVM_DEBUGREG_BP_ENABLED = 1,
        KVM_DEBUGREG_WONT_EXIT = 2,
-       KVM_DEBUGREG_RELOAD = 4,
 };
 
 struct kvm_mtrr_range {
@@ -723,7 +730,6 @@ struct kvm_vcpu_arch {
 
        u64 reserved_gpa_bits;
        int maxphyaddr;
-       int max_tdp_level;
 
        /* emulate context */
 
@@ -988,6 +994,12 @@ struct kvm_hv {
        /* How many vCPUs have VP index != vCPU index */
        atomic_t num_mismatched_vp_indexes;
 
+       /*
+        * How many SynICs use the 'AutoEOI' feature
+        * (protected by arch.apicv_update_lock)
+        */
+       unsigned int synic_auto_eoi_used;
+
        struct hv_partition_assist_pg *hv_pa_pg;
        struct kvm_hv_syndbg hv_syndbg;
 };
@@ -1002,9 +1014,8 @@ struct msr_bitmap_range {
 /* Xen emulation context */
 struct kvm_xen {
        bool long_mode;
-       bool shinfo_set;
        u8 upcall_vector;
-       struct gfn_to_hva_cache shinfo_cache;
+       gfn_t shinfo_gfn;
 };
 
 enum kvm_irqchip_mode {
@@ -1061,6 +1072,9 @@ struct kvm_arch {
        struct kvm_apic_map __rcu *apic_map;
        atomic_t apic_map_dirty;
 
+       /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
+       struct mutex apicv_update_lock;
+
        bool apic_access_memslot_enabled;
        unsigned long apicv_inhibit_reasons;
 
@@ -1213,9 +1227,17 @@ struct kvm_vm_stat {
        u64 mmu_recycled;
        u64 mmu_cache_miss;
        u64 mmu_unsync;
-       u64 lpages;
+       union {
+               struct {
+                       atomic64_t pages_4k;
+                       atomic64_t pages_2m;
+                       atomic64_t pages_1g;
+               };
+               atomic64_t pages[KVM_NR_PAGE_SIZES];
+       };
        u64 nx_lpage_splits;
        u64 max_mmu_page_hash_collisions;
+       u64 max_mmu_rmap_size;
 };
 
 struct kvm_vcpu_stat {
@@ -1359,7 +1381,6 @@ struct kvm_x86_ops {
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
        bool (*check_apicv_inhibit_reasons)(ulong bit);
-       void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate);
        void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
        void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
@@ -1543,12 +1564,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot,
+                                     const struct kvm_memory_slot *memslot,
                                      int start_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
-                                  struct kvm_memory_slot *memslot);
+                                  const struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1744,6 +1765,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
 void kvm_request_apicv_update(struct kvm *kvm, bool activate,
                              unsigned long bit);
 
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate,
+                               unsigned long bit);
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
@@ -1754,8 +1778,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-                      int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+                      int tdp_max_root_level, int tdp_huge_page_level);
 
 static inline u16 kvm_read_ldt(void)
 {
@@ -1779,11 +1803,6 @@ static inline unsigned long read_msr(unsigned long msr)
 }
 #endif
 
-static inline u32 get_rdx_init_val(void)
-{
-       return 0x600; /* P6 family */
-}
-
 static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
 {
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -1816,31 +1835,6 @@ enum {
 #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
 #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
 
-asmlinkage void kvm_spurious_fault(void);
-
-/*
- * Hardware virtualization extension instructions may fault if a
- * reboot turns off virtualization while processes are running.
- * Usually after catching the fault we just panic; during reboot
- * instead the instruction is ignored.
- */
-#define __kvm_handle_fault_on_reboot(insn)                             \
-       "666: \n\t"                                                     \
-       insn "\n\t"                                                     \
-       "jmp    668f \n\t"                                              \
-       "667: \n\t"                                                     \
-       "1: \n\t"                                                       \
-       ".pushsection .discard.instr_begin \n\t"                        \
-       ".long 1b - . \n\t"                                             \
-       ".popsection \n\t"                                              \
-       "call   kvm_spurious_fault \n\t"                                \
-       "1: \n\t"                                                       \
-       ".pushsection .discard.instr_end \n\t"                          \
-       ".long 1b - . \n\t"                                             \
-       ".popsection \n\t"                                              \
-       "668: \n\t"                                                     \
-       _ASM_EXTABLE(666b, 667b)
-
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
index a6c327f..2ef1f65 100644 (file)
@@ -295,6 +295,7 @@ struct kvm_debug_exit_arch {
 #define KVM_GUESTDBG_USE_HW_BP         0x00020000
 #define KVM_GUESTDBG_INJECT_DB         0x00040000
 #define KVM_GUESTDBG_INJECT_BP         0x00080000
+#define KVM_GUESTDBG_BLOCKIRQ          0x00100000
 
 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
index a26643d..b656456 100644 (file)
@@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val)
        } else {
                local_irq_disable();
 
+               /* safe_halt() will enable IRQs */
                if (READ_ONCE(*ptr) == val)
                        safe_halt();
-
-               local_irq_enable();
+               else
+                       local_irq_enable();
        }
 }
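
The updated logic leans on safe_halt() enabling interrupts and halting in one go: STI only takes effect after the following instruction, so a wake-up interrupt that becomes pending between the READ_ONCE() check and the halt is still delivered and terminates the HLT. A sketch of the helper's core (the real one lives in the x86 irqflags headers):

/* Sketch: enable IRQs and halt atomically with respect to pending wakeups. */
static inline void safe_halt_sketch(void)
{
	asm volatile("sti; hlt" : : : "memory");
}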
 
index 95a9841..54a83a7 100644 (file)
@@ -7,6 +7,8 @@
 #include <linux/kvm_host.h>
 #include <linux/debugfs.h>
 #include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
 
 static int vcpu_get_timer_advance_ns(void *data, u64 *val)
 {
@@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_
                                    &vcpu_tsc_scaling_frac_fops);
        }
 }
+
+/*
+ * This covers rmap counts < 1024 (11 = log2(1024) + 1), which should be
+ * enough to cover RMAP_RECYCLE_THRESHOLD.
+ */
+#define  RMAP_LOG_SIZE  11
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+       struct kvm_rmap_head *rmap;
+       struct kvm *kvm = m->private;
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       unsigned int lpage_size, index;
+       /* Still small enough to be on the stack */
+       unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+       int i, j, k, l, ret;
+
+       ret = -ENOMEM;
+       memset(log, 0, sizeof(log));
+       for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+               log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+               if (!log[i])
+                       goto out;
+       }
+
+       mutex_lock(&kvm->slots_lock);
+       write_lock(&kvm->mmu_lock);
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               for (j = 0; j < slots->used_slots; j++) {
+                       slot = &slots->memslots[j];
+                       for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+                               rmap = slot->arch.rmap[k];
+                               lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+                               cur = log[k];
+                               for (l = 0; l < lpage_size; l++) {
+                                       index = ffs(pte_list_count(&rmap[l]));
+                                       if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+                                               index = RMAP_LOG_SIZE - 1;
+                                       cur[index]++;
+                               }
+                       }
+               }
+       }
+
+       write_unlock(&kvm->mmu_lock);
+       mutex_unlock(&kvm->slots_lock);
+
+       /* index=0 counts no rmap; index=1 counts 1 rmap */
+       seq_printf(m, "Rmap_Count:\t0\t1\t");
+       for (i = 2; i < RMAP_LOG_SIZE; i++) {
+               j = 1 << (i - 1);
+               k = (1 << i) - 1;
+               seq_printf(m, "%d-%d\t", j, k);
+       }
+       seq_printf(m, "\n");
+
+       for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+               seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+               cur = log[i];
+               for (j = 0; j < RMAP_LOG_SIZE; j++)
+                       seq_printf(m, "%d\t", cur[j]);
+               seq_printf(m, "\n");
+       }
+
+       ret = 0;
+out:
+       for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+               kfree(log[i]);
+
+       return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+       struct kvm *kvm = inode->i_private;
+
+       if (!kvm_get_kvm_safe(kvm))
+               return -ENOENT;
+
+       return single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+       struct kvm *kvm = inode->i_private;
+
+       kvm_put_kvm(kvm);
+
+       return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+       .open           = kvm_mmu_rmaps_stat_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = kvm_mmu_rmaps_stat_release,
+};
+
+int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+       debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+                           &mmu_rmaps_stat_fops);
+       return 0;
+}
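
For reference, the header row printed by kvm_mmu_rmaps_stat_show() follows directly from the loops above: buckets 0 and 1, then power-of-two ranges up to 512-1023, with one row per page size. The counts themselves depend entirely on the VM; shape only:

Rmap_Count:  0  1  2-3  4-7  8-15  16-31  32-63  64-127  128-255  256-511  512-1023
Level=4K:    ...
Level=2M:    ...
Level=1G:    ...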
index 41d2a53..232a86a 100644 (file)
@@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
 static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
                                int vector)
 {
+       struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+       struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+       int auto_eoi_old, auto_eoi_new;
+
        if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
                return;
 
@@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
        else
                __clear_bit(vector, synic->vec_bitmap);
 
+       auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
        if (synic_has_vector_auto_eoi(synic, vector))
                __set_bit(vector, synic->auto_eoi_bitmap);
        else
                __clear_bit(vector, synic->auto_eoi_bitmap);
+
+       auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
+       if (!!auto_eoi_old == !!auto_eoi_new)
+               return;
+
+       mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+       if (auto_eoi_new)
+               hv->synic_auto_eoi_used++;
+       else
+               hv->synic_auto_eoi_used--;
+
+       __kvm_request_apicv_update(vcpu->kvm,
+                                  !hv->synic_auto_eoi_used,
+                                  APICV_INHIBIT_REASON_HYPERV);
+
+       mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
 }
 
 static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
 
        synic = to_hv_synic(vcpu);
 
-       /*
-        * Hyper-V SynIC auto EOI SINT's are
-        * not compatible with APICV, so request
-        * to deactivate APICV permanently.
-        */
-       kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
        synic->active = true;
        synic->dont_zero_synic_pages = dont_zero_synic_pages;
        synic->control = HV_SYNIC_CONTROL_ENABLE;
@@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
                                ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
                        if (!cpu_smt_possible())
                                ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+                       ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
                        /*
                         * Default number of spinlock retry attempts, matches
                         * HyperV 2016.
index a6e218c..5a69cce 100644 (file)
@@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
        struct hrtimer *timer;
 
-       if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+       /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+       if (vcpu->vcpu_id || !pit)
                return;
 
        timer = &pit->pit_state.timer;
index 11e4065..bbd4a5d 100644 (file)
@@ -35,11 +35,7 @@ struct kvm_vcpu;
 #define        IOAPIC_INIT                     0x5
 #define        IOAPIC_EXTINT                   0x7
 
-#ifdef CONFIG_X86
 #define RTC_GSI 8
-#else
-#define RTC_GSI -1U
-#endif
 
 struct dest_map {
        /* vcpu bitmap where IRQ has been sent */
index ba5a278..76fb009 100644 (file)
@@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
        if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
                return;
 
+       WARN_ONCE(!irqchip_in_kernel(kvm),
+                 "Dirty APIC map without an in-kernel local APIC");
+
        mutex_lock(&kvm->arch.apic_map_lock);
        /*
         * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
@@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
        u64 old_value = vcpu->arch.apic_base;
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!apic)
-               value |= MSR_IA32_APICBASE_BSP;
-
        vcpu->arch.apic_base = value;
 
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
@@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        struct kvm_lapic *apic = vcpu->arch.apic;
        int i;
 
+       if (!init_event) {
+               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                      MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(vcpu))
+                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+       }
+
        if (!apic)
                return;
 
@@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        hrtimer_cancel(&apic->lapic_timer.timer);
 
        if (!init_event) {
-               kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
-                                        MSR_IA32_APICBASE_ENABLE);
+               apic->base_address = APIC_DEFAULT_PHYS_BASE;
+
                kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
        }
        kvm_apic_set_version(apic->vcpu);
@@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        apic->highest_isr_cache = -1;
        update_divide_count(apic);
        atomic_set(&apic->lapic_timer.pending, 0);
-       if (kvm_vcpu_is_bsp(vcpu))
-               kvm_lapic_set_base(vcpu,
-                               vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
+
        vcpu->arch.pv_eoi.msr_val = 0;
        apic_update_ppr(apic);
        if (vcpu->arch.apicv_active) {
@@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
                lapic_timer_advance_dynamic = false;
        }
 
-       /*
-        * APIC is created enabled. This will prevent kvm_lapic_set_base from
-        * thinking that APIC state has changed.
-        */
-       vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
        static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
index 83e6c69..e9688a9 100644 (file)
@@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
        return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
 }
 
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+       /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
+       return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+                     int level)
+{
+       return gfn_to_index(slot->base_gfn + npages - 1,
+                           slot->base_gfn, level) + 1;
+}
+
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
+{
+       return __kvm_mmu_slot_lpages(slot, slot->npages, level);
+}
+
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+       atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
 #endif
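
kvm_mmu_slot_lpages() simply counts how many large-page-aligned chunks a memslot spans at a given level. A worked example with a hypothetical slot: base_gfn = 0x800, npages = 0x1000, 2M level (KVM_HPAGE_GFN_SHIFT = 9):

/* gfn_to_index(0x17ff, 0x800, 2M) = (0x17ff >> 9) - (0x800 >> 9) = 11 - 4 = 7,
 * so __kvm_mmu_slot_lpages() returns 7 + 1 = 8 rmap heads for that level.
 */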
index 47b7652..2d7e611 100644 (file)
@@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 bool tdp_enabled = false;
 
 static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
 static int max_tdp_level __read_mostly;
 
 enum {
@@ -137,12 +138,22 @@ module_param(dbg, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-/* make pte_list_desc fit well in cache line */
-#define PTE_LIST_EXT 3
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
 
+/*
+ * Slight optimization of cacheline layout, by putting `more' and `spte_count'
+ * at the start; accessing the descriptor then touches only a single cache
+ * line both in the full case (entries == PTE_LIST_EXT) and when entries <= 6.
+ */
 struct pte_list_desc {
-       u64 *sptes[PTE_LIST_EXT];
        struct pte_list_desc *more;
+       /*
+        * Stores the number of entries in this pte_list_desc.  It doesn't
+        * need to be u64, but u64 keeps the alignment simple.  A value of
+        * PTE_LIST_EXT means the descriptor is full.
+        */
+       u64 spte_count;
+       u64 *sptes[PTE_LIST_EXT];
 };
 
 struct kvm_shadow_walk_iterator {
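For reference (not part of the patch), the layout the comment above is counting works out, on a 64-bit build, to exactly two 64-byte cache lines:

/*
 * sizeof(struct pte_list_desc) = 8 ('more') + 8 ('spte_count') + 14 * 8 ('sptes')
 *                              = 128 bytes = two 64-byte cache lines
 * The first cache line holds 'more', 'spte_count' and sptes[0..5], so a
 * descriptor with six or fewer entries is read with a single cache-line access.
 */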
@@ -193,7 +204,7 @@ struct kvm_mmu_role_regs {
  * the single source of truth for the MMU's state.
  */
 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)                  \
-static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
 {                                                                      \
        return !!(regs->reg & flag);                                    \
 }
@@ -215,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
  * and the vCPU may be incorrect/irrelevant.
  */
 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)                \
-static inline bool is_##reg##_##name(struct kvm_mmu *mmu)      \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)       \
 {                                                              \
        return !!(mmu->mmu_role. base_or_ext . reg##_##name);   \
 }
@@ -323,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
                                   struct x86_exception *exception)
 {
-       /* Check if guest physical address doesn't exceed guest maximum */
-       if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
-               exception->error_code |= PFERR_RSVD_MASK;
-               return UNMAPPED_GVA;
-       }
-
         return gpa;
 }
 
@@ -592,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
  * Rules for using mmu_spte_clear_track_bits:
  * It sets the sptep from present to nonpresent, and track the
  * state bits, it is used to clear the last level sptep.
- * Returns non-zero if the PTE was previously valid.
+ * Returns the old PTE.
  */
-static int mmu_spte_clear_track_bits(u64 *sptep)
+static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
 {
        kvm_pfn_t pfn;
        u64 old_spte = *sptep;
+       int level = sptep_to_sp(sptep)->role.level;
 
        if (!spte_has_volatile_bits(old_spte))
                __update_clear_spte_fast(sptep, 0ull);
@@ -605,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
                old_spte = __update_clear_spte_slow(sptep, 0ull);
 
        if (!is_shadow_present_pte(old_spte))
-               return 0;
+               return old_spte;
+
+       kvm_update_page_stats(kvm, level, -1);
 
        pfn = spte_to_pfn(old_spte);
 
@@ -622,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
        if (is_dirty_spte(old_spte))
                kvm_set_pfn_dirty(pfn);
 
-       return 1;
+       return old_spte;
 }
 
 /*
@@ -686,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-       /*
-        * Prevent page table teardown by making any free-er wait during
-        * kvm_flush_remote_tlbs() IPI to all active vcpus.
-        */
-       local_irq_disable();
+       if (is_tdp_mmu(vcpu->arch.mmu)) {
+               kvm_tdp_mmu_walk_lockless_begin();
+       } else {
+               /*
+                * Prevent page table teardown by making any free-er wait during
+                * kvm_flush_remote_tlbs() IPI to all active vcpus.
+                */
+               local_irq_disable();
 
-       /*
-        * Make sure a following spte read is not reordered ahead of the write
-        * to vcpu->mode.
-        */
-       smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+               /*
+                * Make sure a following spte read is not reordered ahead of the write
+                * to vcpu->mode.
+                */
+               smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+       }
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-       /*
-        * Make sure the write to vcpu->mode is not reordered in front of
-        * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
-        * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
-        */
-       smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
-       local_irq_enable();
+       if (is_tdp_mmu(vcpu->arch.mmu)) {
+               kvm_tdp_mmu_walk_lockless_end();
+       } else {
+               /*
+                * Make sure the write to vcpu->mode is not reordered in front of
+                * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
+                * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+                */
+               smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+               local_irq_enable();
+       }
 }
 
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
@@ -786,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
        return &slot->arch.lpage_info[level - 2][idx];
 }
 
-static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
                                            gfn_t gfn, int count)
 {
        struct kvm_lpage_info *linfo;
@@ -799,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
        }
 }
 
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
        update_gfn_disallow_lpage_count(slot, gfn, 1);
 }
 
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
        update_gfn_disallow_lpage_count(slot, gfn, -1);
 }
@@ -893,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
                        struct kvm_rmap_head *rmap_head)
 {
        struct pte_list_desc *desc;
-       int i, count = 0;
+       int count = 0;
 
        if (!rmap_head->val) {
                rmap_printk("%p %llx 0->1\n", spte, *spte);
@@ -903,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
                desc = mmu_alloc_pte_list_desc(vcpu);
                desc->sptes[0] = (u64 *)rmap_head->val;
                desc->sptes[1] = spte;
+               desc->spte_count = 2;
                rmap_head->val = (unsigned long)desc | 1;
                ++count;
        } else {
                rmap_printk("%p %llx many->many\n", spte, *spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-               while (desc->sptes[PTE_LIST_EXT-1]) {
+               while (desc->spte_count == PTE_LIST_EXT) {
                        count += PTE_LIST_EXT;
-
                        if (!desc->more) {
                                desc->more = mmu_alloc_pte_list_desc(vcpu);
                                desc = desc->more;
+                               desc->spte_count = 0;
                                break;
                        }
                        desc = desc->more;
                }
-               for (i = 0; desc->sptes[i]; ++i)
-                       ++count;
-               desc->sptes[i] = spte;
+               count += desc->spte_count;
+               desc->sptes[desc->spte_count++] = spte;
        }
        return count;
 }
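A worked example of the new bookkeeping (illustrative numbers): adding the 17th SPTE for a gfn whose rmap already holds a full descriptor (14 entries) chained to a second one holding 2:

/*
 * while loop: first desc is full            -> count = 14, follow ->more
 * second desc: spte_count == 2, not full    -> count += 2  (count == 16)
 * the new SPTE lands in sptes[2], spte_count becomes 3, and pte_list_add()
 * returns 16, i.e. the number of rmap entries that existed before the add.
 */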
@@ -930,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
                           struct pte_list_desc *desc, int i,
                           struct pte_list_desc *prev_desc)
 {
-       int j;
+       int j = desc->spte_count - 1;
 
-       for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
-               ;
        desc->sptes[i] = desc->sptes[j];
        desc->sptes[j] = NULL;
-       if (j != 0)
+       desc->spte_count--;
+       if (desc->spte_count)
                return;
        if (!prev_desc && !desc->more)
                rmap_head->val = 0;
@@ -969,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                prev_desc = NULL;
                while (desc) {
-                       for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+                       for (i = 0; i < desc->spte_count; ++i) {
                                if (desc->sptes[i] == spte) {
                                        pte_list_desc_remove_entry(rmap_head,
                                                        desc, i, prev_desc);
@@ -984,30 +999,68 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
        }
 }
 
-static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                           u64 *sptep)
 {
-       mmu_spte_clear_track_bits(sptep);
+       mmu_spte_clear_track_bits(kvm, sptep);
        __pte_list_remove(sptep, rmap_head);
 }
 
-static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
-                                          struct kvm_memory_slot *slot)
+/* Return true if rmap existed, false otherwise */
+static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
-       unsigned long idx;
+       struct pte_list_desc *desc, *next;
+       int i;
 
-       idx = gfn_to_index(gfn, slot->base_gfn, level);
-       return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+       if (!rmap_head->val)
+               return false;
+
+       if (!(rmap_head->val & 1)) {
+               mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+               goto out;
+       }
+
+       desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+       for (; desc; desc = next) {
+               for (i = 0; i < desc->spte_count; i++)
+                       mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+               next = desc->more;
+               mmu_free_pte_list_desc(desc);
+       }
+out:
+       /* rmap_head is meaningless now, remember to reset it */
+       rmap_head->val = 0;
+       return true;
 }
 
-static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
-                                        struct kvm_mmu_page *sp)
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *slot;
+       struct pte_list_desc *desc;
+       unsigned int count = 0;
 
-       slots = kvm_memslots_for_spte_role(kvm, sp->role);
-       slot = __gfn_to_memslot(slots, gfn);
-       return __gfn_to_rmap(gfn, sp->role.level, slot);
+       if (!rmap_head->val)
+               return 0;
+       else if (!(rmap_head->val & 1))
+               return 1;
+
+       desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+       while (desc) {
+               count += desc->spte_count;
+               desc = desc->more;
+       }
+
+       return count;
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+                                        const struct kvm_memory_slot *slot)
+{
+       unsigned long idx;
+
+       idx = gfn_to_index(gfn, slot->base_gfn, level);
+       return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1020,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
+       struct kvm_memory_slot *slot;
        struct kvm_mmu_page *sp;
        struct kvm_rmap_head *rmap_head;
 
        sp = sptep_to_sp(spte);
        kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-       rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
        return pte_list_add(vcpu, spte, rmap_head);
 }
 
+
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *slot;
        struct kvm_mmu_page *sp;
        gfn_t gfn;
        struct kvm_rmap_head *rmap_head;
 
        sp = sptep_to_sp(spte);
        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-       rmap_head = gfn_to_rmap(kvm, gfn, sp);
+
+       /*
+        * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the
+        * context of a vCPU so have to determine which memslots to use based
+        * on context information in sp->role.
+        */
+       slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+       slot = __gfn_to_memslot(slots, gfn);
+       rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
        __pte_list_remove(spte, rmap_head);
 }
 
@@ -1119,7 +1187,9 @@ out:
 
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-       if (mmu_spte_clear_track_bits(sptep))
+       u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+       if (is_shadow_present_pte(old_spte))
                rmap_remove(kvm, sptep);
 }
 
@@ -1129,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
        if (is_large_pte(*sptep)) {
                WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
                drop_spte(kvm, sptep);
-               --kvm->stat.lpages;
                return true;
        }
 
@@ -1218,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
  * Returns true iff any D or W bits were cleared.
  */
 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                              struct kvm_memory_slot *slot)
+                              const struct kvm_memory_slot *slot)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1256,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
                return;
 
        while (mask) {
-               rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
-                                         PG_LEVEL_4K, slot);
+               rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+                                       PG_LEVEL_4K, slot);
                __rmap_write_protect(kvm, rmap_head, false);
 
                /* clear the first set bit */
@@ -1289,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                return;
 
        while (mask) {
-               rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
-                                         PG_LEVEL_4K, slot);
+               rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+                                       PG_LEVEL_4K, slot);
                __rmap_clear_dirty(kvm, rmap_head, slot);
 
                /* clear the first set bit */
@@ -1356,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 
        if (kvm_memslots_have_rmaps(kvm)) {
                for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
-                       rmap_head = __gfn_to_rmap(gfn, i, slot);
+                       rmap_head = gfn_to_rmap(gfn, i, slot);
                        write_protected |= __rmap_write_protect(kvm, rmap_head, true);
                }
        }
@@ -1377,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 }
 
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                         struct kvm_memory_slot *slot)
+                         const struct kvm_memory_slot *slot)
 {
-       u64 *sptep;
-       struct rmap_iterator iter;
-       bool flush = false;
-
-       while ((sptep = rmap_get_first(rmap_head, &iter))) {
-               rmap_printk("spte %p %llx.\n", sptep, *sptep);
-
-               pte_list_remove(rmap_head, sptep);
-               flush = true;
-       }
-
-       return flush;
+       return pte_list_destroy(kvm, rmap_head);
 }
 
 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1421,13 +1479,13 @@ restart:
                need_flush = 1;
 
                if (pte_write(pte)) {
-                       pte_list_remove(rmap_head, sptep);
+                       pte_list_remove(kvm, rmap_head, sptep);
                        goto restart;
                } else {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        *sptep, new_pfn);
 
-                       mmu_spte_clear_track_bits(sptep);
+                       mmu_spte_clear_track_bits(kvm, sptep);
                        mmu_spte_set(sptep, new_spte);
                }
        }
@@ -1442,7 +1500,7 @@ restart:
 
 struct slot_rmap_walk_iterator {
        /* input fields. */
-       struct kvm_memory_slot *slot;
+       const struct kvm_memory_slot *slot;
        gfn_t start_gfn;
        gfn_t end_gfn;
        int start_level;
@@ -1462,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
 {
        iterator->level = level;
        iterator->gfn = iterator->start_gfn;
-       iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
-       iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
-                                          iterator->slot);
+       iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+       iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
 }
 
 static void
 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
-                   struct kvm_memory_slot *slot, int start_level,
+                   const struct kvm_memory_slot *slot, int start_level,
                    int end_level, gfn_t start_gfn, gfn_t end_gfn)
 {
        iterator->slot = slot;
@@ -1584,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
+       struct kvm_memory_slot *slot;
        struct kvm_rmap_head *rmap_head;
        struct kvm_mmu_page *sp;
 
        sp = sptep_to_sp(spte);
-
-       rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
 
        kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
        kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
@@ -2232,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
        if (is_shadow_present_pte(pte)) {
                if (is_last_spte(pte, sp->role.level)) {
                        drop_spte(kvm, spte);
-                       if (is_large_pte(pte))
-                               --kvm->stat.lpages;
                } else {
                        child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
                        drop_parent_pte(child, spte);
@@ -2716,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
        trace_kvm_mmu_set_spte(level, gfn, sptep);
-       if (!was_rmapped && is_large_pte(*sptep))
-               ++vcpu->kvm->stat.lpages;
 
-       if (is_shadow_present_pte(*sptep)) {
-               if (!was_rmapped) {
-                       rmap_count = rmap_add(vcpu, sptep, gfn);
-                       if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-                               rmap_recycle(vcpu, sptep, gfn);
-               }
+       if (!was_rmapped) {
+               kvm_update_page_stats(vcpu->kvm, level, 1);
+               rmap_count = rmap_add(vcpu, sptep, gfn);
+               if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+                       rmap_recycle(vcpu, sptep, gfn);
        }
 
        return ret;
@@ -2852,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
                              kvm_pfn_t pfn, int max_level)
 {
        struct kvm_lpage_info *linfo;
+       int host_level;
 
        max_level = min(max_level, max_huge_page_level);
        for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -2863,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
        if (max_level == PG_LEVEL_4K)
                return PG_LEVEL_4K;
 
-       return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+       host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+       return min(host_level, max_level);
 }
 
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2887,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
        if (!slot)
                return PG_LEVEL_4K;
 
-       level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
-       if (level == PG_LEVEL_4K)
-               return level;
-
-       *req_level = level = min(level, max_level);
-
        /*
         * Enforce the iTLB multihit workaround after capturing the requested
         * level, which will be used to do precise, accurate accounting.
         */
-       if (huge_page_disallowed)
+       *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
+       if (level == PG_LEVEL_4K || huge_page_disallowed)
                return PG_LEVEL_4K;
 
        /*
@@ -2965,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                        break;
 
                drop_large_spte(vcpu, it.sptep);
-               if (!is_shadow_present_pte(*it.sptep)) {
-                       sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
-                                             it.level - 1, true, ACC_ALL);
-
-                       link_shadow_page(vcpu, it.sptep, sp);
-                       if (is_tdp && huge_page_disallowed &&
-                           req_level >= it.level)
-                               account_huge_nx_page(vcpu->kvm, sp);
-               }
+               if (is_shadow_present_pte(*it.sptep))
+                       continue;
+
+               sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+                                     it.level - 1, true, ACC_ALL);
+
+               link_shadow_page(vcpu, it.sptep, sp);
+               if (is_tdp && huge_page_disallowed &&
+                   req_level >= it.level)
+                       account_huge_nx_page(vcpu->kvm, sp);
        }
 
        ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
@@ -3122,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
 }
 
 /*
- * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ *  - Must be called between walk_shadow_page_lockless_{begin,end}.
+ *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
  */
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                          u32 error_code)
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
 {
        struct kvm_shadow_walk_iterator iterator;
+       u64 old_spte;
+       u64 *sptep = NULL;
+
+       for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+               sptep = iterator.sptep;
+               *spte = old_spte;
+
+               if (!is_shadow_present_pte(old_spte))
+                       break;
+       }
+
+       return sptep;
+}
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
+{
        struct kvm_mmu_page *sp;
        int ret = RET_PF_INVALID;
        u64 spte = 0ull;
+       u64 *sptep = NULL;
        uint retry_count = 0;
 
        if (!page_fault_can_be_fast(error_code))
@@ -3141,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        do {
                u64 new_spte;
 
-               for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
-                       if (!is_shadow_present_pte(spte))
-                               break;
+               if (is_tdp_mmu(vcpu->arch.mmu))
+                       sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
+               else
+                       sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);
 
                if (!is_shadow_present_pte(spte))
                        break;
 
-               sp = sptep_to_sp(iterator.sptep);
+               sp = sptep_to_sp(sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;
 
@@ -3206,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 * since the gfn is not stable for indirect shadow page. See
                 * Documentation/virt/kvm/locking.rst to get more detail.
                 */
-               if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
-                                           new_spte)) {
+               if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) {
                        ret = RET_PF_FIXED;
                        break;
                }
@@ -3220,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
        } while (true);
 
-       trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
-                             spte, ret);
+       trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret);
        walk_shadow_page_lockless_end(vcpu);
 
        return ret;
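A hedged sketch of the calling convention fast_page_fault() now relies on (every identifier below comes from hunks in this patch): the walk is bracketed by the lockless begin/end pair, and the returned sptep must not be used after the end call.

static void lockless_spte_peek_example(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	u64 spte;
	u64 *sptep;

	walk_shadow_page_lockless_begin(vcpu);

	if (is_tdp_mmu(vcpu->arch.mmu))
		sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
	else
		sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);

	/* inspect or cmpxchg *sptep only while still inside the bracket */

	walk_shadow_page_lockless_end(vcpu);
}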
@@ -3455,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * the shadow page table may be a PAE or a long mode page table.
         */
        pm_mask = PT_PRESENT_MASK | shadow_me_mask;
-       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+       if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
                if (WARN_ON_ONCE(!mmu->pml4_root)) {
                        r = -EIO;
                        goto out_unlock;
                }
-
                mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+               if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+                       if (WARN_ON_ONCE(!mmu->pml5_root)) {
+                               r = -EIO;
+                               goto out_unlock;
+                       }
+                       mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+               }
        }
 
        for (i = 0; i < 4; ++i) {
@@ -3482,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                mmu->pae_root[i] = root | pm_mask;
        }
 
-       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+       if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+               mmu->root_hpa = __pa(mmu->pml5_root);
+       else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
                mmu->root_hpa = __pa(mmu->pml4_root);
        else
                mmu->root_hpa = __pa(mmu->pae_root);
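The resulting chain of special roots when shadowing 32-bit or PAE NPT on a 5-level host looks like this (illustrative diagram; pm_mask is the value built above):

/*
 *   mmu->root_hpa = __pa(pml5_root)
 *        pml5_root[0] = __pa(pml4_root) | pm_mask
 *             pml4_root[0] = __pa(pae_root) | pm_mask
 *                  pae_root[0..3] = per-PDPTE shadow roots | pm_mask
 */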
@@ -3498,7 +3582,10 @@ out_unlock:
 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
-       u64 *pml4_root, *pae_root;
+       bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
+       u64 *pml5_root = NULL;
+       u64 *pml4_root = NULL;
+       u64 *pae_root;
 
        /*
         * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3511,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
                return 0;
 
        /*
-        * This mess only works with 4-level paging and needs to be updated to
-        * work with 5-level paging.
+        * NPT, the only paging mode that uses this horror, uses a fixed number
+        * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+ * all MMUs are 5-level.  Thus, this can safely require that pml5_root
+        * is allocated if the other roots are valid and pml5 is needed, as any
+        * prior MMU would also have required pml5.
         */
-       if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
-               return -EIO;
-
-       if (mmu->pae_root && mmu->pml4_root)
+       if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
                return 0;
 
        /*
         * The special roots should always be allocated in concert.  Yell and
         * bail if KVM ends up in a state where only one of the roots is valid.
         */
-       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
+       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+                        (need_pml5 && mmu->pml5_root)))
                return -EIO;
 
        /*
@@ -3535,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
        if (!pae_root)
                return -ENOMEM;
 
+#ifdef CONFIG_X86_64
        pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-       if (!pml4_root) {
-               free_page((unsigned long)pae_root);
-               return -ENOMEM;
+       if (!pml4_root)
+               goto err_pml4;
+
+       if (need_pml5) {
+               pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+               if (!pml5_root)
+                       goto err_pml5;
        }
+#endif
 
        mmu->pae_root = pae_root;
        mmu->pml4_root = pml4_root;
+       mmu->pml5_root = pml5_root;
 
        return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+       free_page((unsigned long)pml4_root);
+err_pml4:
+       free_page((unsigned long)pae_root);
+       return -ENOMEM;
+#endif
 }
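Illustrative summary (64-bit hosts) of which special roots mmu_alloc_special_roots() now allocates; need_pml5 follows from shadow_root_level as defined above:

/*
 *   shadow_root_level        pae_root   pml4_root   pml5_root
 *   PT64_ROOT_4LEVEL (4)       yes         yes         no
 *   PT64_ROOT_5LEVEL (5)       yes         yes         yes
 */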
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3640,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 /*
  * Return the level of the lowest level SPTE added to sptes.
  * That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
  */
 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
 {
@@ -3647,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
        int leaf = -1;
        u64 spte;
 
-       walk_shadow_page_lockless_begin(vcpu);
-
        for (shadow_walk_init(&iterator, vcpu, addr),
             *root_level = iterator.level;
             shadow_walk_okay(&iterator);
@@ -3662,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
                        break;
        }
 
-       walk_shadow_page_lockless_end(vcpu);
-
        return leaf;
 }
 
@@ -3675,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
        int root, leaf, level;
        bool reserved = false;
 
+       walk_shadow_page_lockless_begin(vcpu);
+
        if (is_tdp_mmu(vcpu->arch.mmu))
                leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
        else
                leaf = get_walk(vcpu, addr, sptes, &root);
 
+       walk_shadow_page_lockless_end(vcpu);
+
        if (unlikely(leaf < 0)) {
                *sptep = 0ull;
                return reserved;
@@ -3795,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
-                        bool write, bool *writable)
+                        bool write, bool *writable, int *r)
 {
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
@@ -3808,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
         * be zapped before KVM inserts a new MMIO SPTE for the gfn.
         */
        if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
-               return true;
-
-       /* Don't expose private memslots to L2. */
-       if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
-               *pfn = KVM_PFN_NOSLOT;
-               *writable = false;
-               return false;
+               goto out_retry;
+
+       if (!kvm_is_visible_memslot(slot)) {
+               /* Don't expose private memslots to L2. */
+               if (is_guest_mode(vcpu)) {
+                       *pfn = KVM_PFN_NOSLOT;
+                       *writable = false;
+                       return false;
+               }
+               /*
+                * If the APIC access page exists but is disabled, go directly
+                * to emulation without caching the MMIO access or creating an
+                * MMIO SPTE.  That way the cache doesn't need to be purged
+                * when the AVIC is re-enabled.
+                */
+               if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
+                   !kvm_apicv_activated(vcpu->kvm)) {
+                       *r = RET_PF_EMULATE;
+                       return true;
+               }
        }
 
        async = false;
@@ -3828,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
                        trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
-                       return true;
+                       goto out_retry;
                } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
-                       return true;
+                       goto out_retry;
        }
 
        *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
                                    write, writable, hva);
-       return false;
+
+out_retry:
+       *r = RET_PF_RETRY;
+       return true;
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -3854,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
                return RET_PF_EMULATE;
 
-       if (!is_tdp_mmu_fault) {
-               r = fast_page_fault(vcpu, gpa, error_code);
-               if (r != RET_PF_INVALID)
-                       return r;
-       }
+       r = fast_page_fault(vcpu, gpa, error_code);
+       if (r != RET_PF_INVALID)
+               return r;
 
        r = mmu_topup_memory_caches(vcpu, false);
        if (r)
@@ -3867,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
-                        write, &map_writable))
-               return RET_PF_RETRY;
+       if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva,
+                        write, &map_writable, &r))
+               return r;
 
        if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
                return r;
@@ -4588,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 
 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
+       /* tdp_root_level is architecture forced level, use it if nonzero */
+       if (tdp_root_level)
+               return tdp_root_level;
+
        /* Use 5-level TDP if and only if it's useful/necessary. */
        if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
                return 4;
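An illustrative restatement of the resulting root-level choice (hedged: the final fallback to max_tdp_level is existing context that is not visible in this hunk):

static int example_tdp_level(int tdp_root_level, int max_tdp_level, int maxphyaddr)
{
	if (tdp_root_level)		/* level dictated by hardware, forced by the vendor module */
		return tdp_root_level;
	if (max_tdp_level == 5 && maxphyaddr <= 48)
		return 4;		/* 5-level TDP buys nothing below a 48-bit guest MAXPHYADDR */
	return max_tdp_level;		/* assumed fallback, not shown in the hunk */
}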
@@ -5160,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
        if (r == RET_PF_INVALID) {
                r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
                                          lower_32_bits(error_code), false);
-               if (WARN_ON_ONCE(r == RET_PF_INVALID))
+               if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
                        return -EIO;
        }
 
@@ -5279,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
         */
 }
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-                      int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+                      int tdp_max_root_level, int tdp_huge_page_level)
 {
        tdp_enabled = enable_tdp;
+       tdp_root_level = tdp_forced_root_level;
        max_tdp_level = tdp_max_root_level;
 
        /*
@@ -5302,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
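A hedged illustration of the new parameter from a caller's point of view (the numeric levels below are made up, not taken from this patch): a backend whose hardware fixes the TDP root level passes it as tdp_forced_root_level, while one that lets KVM choose per vCPU passes 0.

static void example_configure(void)
{
	/* hardware dictates a fixed 5-level TDP root: */
	kvm_configure_mmu(true, 5, 5, PG_LEVEL_1G);
	/* no forced level; KVM picks 4- or 5-level per vCPU, capped at 5: */
	kvm_configure_mmu(true, 0, 5, PG_LEVEL_1G);
}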
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                                   struct kvm_memory_slot *slot);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+                                   struct kvm_rmap_head *rmap_head,
+                                   const struct kvm_memory_slot *slot);
 
 /* The caller should hold mmu-lock before calling this function. */
 static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
                        slot_level_handler fn, int start_level, int end_level,
                        gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
                        bool flush)
@@ -5334,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 }
 
 static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
                  slot_level_handler fn, int start_level, int end_level,
                  bool flush_on_yield)
 {
@@ -5345,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 }
 
 static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot,
                 slot_level_handler fn, bool flush_on_yield)
 {
        return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
@@ -5358,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
                set_memory_encrypted((unsigned long)mmu->pae_root, 1);
        free_page((unsigned long)mmu->pae_root);
        free_page((unsigned long)mmu->pml4_root);
+       free_page((unsigned long)mmu->pml5_root);
 }
 
 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
@@ -5587,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
        kvm_mmu_uninit_tdp_mmu(kvm);
 }
 
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to, but not
+ * including, gfn_end.
+ */
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 {
        struct kvm_memslots *slots;
@@ -5594,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        int i;
        bool flush = false;
 
+       write_lock(&kvm->mmu_lock);
+
+       kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+
        if (kvm_memslots_have_rmaps(kvm)) {
-               write_lock(&kvm->mmu_lock);
                for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                        slots = __kvm_memslots(kvm, i);
                        kvm_for_each_memslot(memslot, slots) {
@@ -5606,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                                if (start >= end)
                                        continue;
 
-                               flush = slot_handle_level_range(kvm, memslot,
+                               flush = slot_handle_level_range(kvm,
+                                               (const struct kvm_memory_slot *) memslot,
                                                kvm_zap_rmapp, PG_LEVEL_4K,
                                                KVM_MAX_HUGEPAGE_LEVEL, start,
                                                end - 1, true, flush);
                        }
                }
                if (flush)
-                       kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
-               write_unlock(&kvm->mmu_lock);
+                       kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+                                                          gfn_end - gfn_start);
        }
 
        if (is_tdp_mmu_enabled(kvm)) {
-               flush = false;
-
-               read_lock(&kvm->mmu_lock);
                for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                        flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
-                                                         gfn_end, flush, true);
+                                                         gfn_end, flush);
                if (flush)
                        kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
-                                                          gfn_end);
-
-               read_unlock(&kvm->mmu_lock);
+                                                          gfn_end - gfn_start);
        }
+
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+       kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+
+       write_unlock(&kvm->mmu_lock);
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
                                    struct kvm_rmap_head *rmap_head,
-                                   struct kvm_memory_slot *slot)
+                                   const struct kvm_memory_slot *slot)
 {
        return __rmap_write_protect(kvm, rmap_head, false);
 }
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot,
+                                     const struct kvm_memory_slot *memslot,
                                      int start_level)
 {
        bool flush = false;
@@ -5676,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                                         struct kvm_rmap_head *rmap_head,
-                                        struct kvm_memory_slot *slot)
+                                        const struct kvm_memory_slot *slot)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -5699,7 +5835,7 @@ restart:
                if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
                    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
                                                               pfn, PG_LEVEL_NUM)) {
-                       pte_list_remove(rmap_head, sptep);
+                       pte_list_remove(kvm, rmap_head, sptep);
 
                        if (kvm_available_flush_tlb_with_range())
                                kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
@@ -5715,10 +5851,8 @@ restart:
 }
 
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                  const struct kvm_memory_slot *memslot)
+                                  const struct kvm_memory_slot *slot)
 {
-       /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
-       struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
        bool flush = false;
 
        if (kvm_memslots_have_rmaps(kvm)) {
@@ -5754,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 }
 
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
-                                  struct kvm_memory_slot *memslot)
+                                  const struct kvm_memory_slot *memslot)
 {
        bool flush = false;
 
index cedc17b..9e7dcf9 100644
@@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
                return;
        }
 
-       rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+       rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot);
        if (!rmap_head->val) {
                if (!__ratelimit(&ratelimit_state))
                        return;
@@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, sp->gfn);
-       rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
+       rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
 
        for_each_rmap_spte(rmap_head, &iter, sptep) {
                if (is_writable_pte(*sptep))
index 3556729..bf2bdbf 100644
@@ -31,13 +31,16 @@ extern bool dbg;
 #define IS_VALID_PAE_ROOT(x)   (!!(x))
 
 struct kvm_mmu_page {
+       /*
+        * Note, "link" through "spt" fit in a single 64 byte cache line on
+        * 64-bit kernels, keep it that way unless there's a reason not to.
+        */
        struct list_head link;
        struct hlist_node hash_link;
-       struct list_head lpage_disallowed_link;
 
+       bool tdp_mmu_page;
        bool unsync;
        u8 mmu_valid_gen;
-       bool mmio_cached;
        bool lpage_disallowed; /* Can't be replaced by an equiv large page */
 
        /*
@@ -59,6 +62,7 @@ struct kvm_mmu_page {
        struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
        DECLARE_BITMAP(unsync_child_bitmap, 512);
 
+       struct list_head lpage_disallowed_link;
 #ifdef CONFIG_X86_32
        /*
         * Used out of the mmu-lock to avoid reading spte values while an
@@ -71,8 +75,6 @@ struct kvm_mmu_page {
        atomic_t write_flooding_count;
 
 #ifdef CONFIG_X86_64
-       bool tdp_mmu_page;
-
        /* Used for freeing the page asynchronously if it is a TDP MMU page. */
        struct rcu_head rcu_head;
 #endif
@@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void)
 
 int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
 
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn,
                                    int min_level);
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                                        u64 start_gfn, u64 pages);
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
 
 /*
  * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
@@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
  * RET_PF_FIXED: The faulting entry has been fixed.
  * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
  */
 enum {
        RET_PF_RETRY = 0,
index efbad33..2924a40 100644
        { PFERR_RSVD_MASK, "RSVD" },    \
        { PFERR_FETCH_MASK, "F" }
 
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
 /*
  * A pagetable walk has started
  */
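A hedged note on why the definitions above matter (illustration, not from this patch): TRACE_DEFINE_ENUM() exports the enum values so that userspace trace tools can resolve them when a tracepoint's TP_printk() prints the return code symbolically, e.g.:

	__print_symbolic(__entry->ret,
			 { RET_PF_RETRY,    "RETRY"    },
			 { RET_PF_EMULATE,  "EMULATE"  },
			 { RET_PF_INVALID,  "INVALID"  },
			 { RET_PF_FIXED,    "FIXED"    },
			 { RET_PF_SPURIOUS, "SPURIOUS" })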
index 91a9f7e..269f11f 100644
@@ -16,6 +16,7 @@
 
 #include <asm/kvm_page_track.h>
 
+#include "mmu.h"
 #include "mmu_internal.h"
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
index ee044d3..7d03e9b 100644
@@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
-                        write_fault, &map_writable))
-               return RET_PF_RETRY;
+       if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+                        write_fault, &map_writable, &r))
+               return r;
 
        if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
                return r;
index d80cb12..64ccfc1 100644
@@ -10,7 +10,7 @@
 #include <asm/cmpxchg.h>
 #include <trace/events/kvm.h>
 
-static bool __read_mostly tdp_mmu_enabled = false;
+static bool __read_mostly tdp_mmu_enabled = true;
 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
 
 /* Initializes the TDP MMU for the VM, if enabled. */
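Usage note (hedged: assumes, as today, that tdp_mmu.c is built into kvm.ko): with the default flipped to true, the old behaviour remains selectable at load time.

/*
 * e.g.  modprobe kvm tdp_mmu=0
 * or    kvm.tdp_mmu=0      on the kernel command line
 */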
@@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
  *
  * @kvm: kvm instance
  * @sp: the new page
- * @shared: This operation may not be running under the exclusive use of
- *         the MMU lock and the operation must synchronize with other
- *         threads that might be adding or removing pages.
  * @account_nx: This page replaces a NX large page and should be marked for
  *             eventual reclaim.
  */
 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-                             bool shared, bool account_nx)
+                             bool account_nx)
 {
-       if (shared)
-               spin_lock(&kvm->arch.tdp_mmu_pages_lock);
-       else
-               lockdep_assert_held_write(&kvm->mmu_lock);
-
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
        if (account_nx)
                account_huge_nx_page(kvm, sp);
-
-       if (shared)
-               spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
 /**
@@ -445,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
        trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
 
-       if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
-               if (is_large_pte(old_spte))
-                       atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
-               else
-                       atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
-       }
-
        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
@@ -477,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                return;
        }
 
+       if (is_leaf != was_leaf)
+               kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
 
        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
@@ -526,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
        if (is_removed_spte(iter->old_spte))
                return false;
 
+       /*
+        * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+        * does not hold the mmu_lock.
+        */
        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
                      new_spte) != iter->old_spte)
                return false;
@@ -537,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
        return true;
 }
 
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-                                          struct tdp_iter *iter,
-                                          u64 new_spte)
+/*
+ * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
+ * TDP page fault.
+ *
+ * @vcpu: The vcpu instance that took the TDP page fault.
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ *
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ *         this function will have no side-effects.
+ */
+static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
+                                              struct tdp_iter *iter,
+                                              u64 new_spte)
 {
+       struct kvm *kvm = vcpu->kvm;
+
        if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
                return false;
 
-       handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
-                                     iter->old_spte, new_spte, iter->level);
+       /*
+        * Use kvm_vcpu_gfn_to_memslot() instead of going through
+        * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
+        */
+       if (is_writable_pte(new_spte)) {
+               struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
+
+               if (slot && kvm_slot_dirty_track_enabled(slot)) {
+                       /* Enforced by kvm_mmu_hugepage_adjust. */
+                       WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
+                       mark_page_dirty_in_slot(kvm, slot, iter->gfn);
+               }
+       }
+
        return true;
 }
 
@@ -558,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
         * immediately installing a present entry in its place
         * before the TLBs are flushed.
         */
-       if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+       if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
                return false;
 
        kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -789,21 +804,15 @@ retry:
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
- *
- * If shared is true, this thread holds the MMU lock in read mode and must
- * account for the possibility that other threads are modifying the paging
- * structures concurrently. If shared is false, this thread should hold the
- * MMU in write mode.
  */
 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-                                gfn_t end, bool can_yield, bool flush,
-                                bool shared)
+                                gfn_t end, bool can_yield, bool flush)
 {
        struct kvm_mmu_page *root;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
                flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
-                                     shared);
+                                     false);
 
        return flush;
 }
@@ -814,8 +823,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
        int i;
 
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
-                                                 flush, false);
+               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
 
        if (flush)
                kvm_flush_remote_tlbs(kvm);
@@ -940,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 
        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
-       else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+       else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
                return RET_PF_RETRY;
 
        /*
@@ -1044,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);
 
-                       if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
-                                                   new_spte)) {
-                               tdp_mmu_link_page(vcpu->kvm, sp, true,
+                       if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
+                               tdp_mmu_link_page(vcpu->kvm, sp,
                                                  huge_page_disallowed &&
                                                  req_level >= iter.level);
 
@@ -1255,8 +1262,8 @@ retry:
  * only affect leaf SPTEs down to min_level.
  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
  */
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                            int min_level)
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+                            const struct kvm_memory_slot *slot, int min_level)
 {
        struct kvm_mmu_page *root;
        bool spte_set = false;
@@ -1326,7 +1333,8 @@ retry:
  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
  * be flushed.
  */
-bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+                                 const struct kvm_memory_slot *slot)
 {
        struct kvm_mmu_page *root;
        bool spte_set = false;
@@ -1529,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 /*
  * Return the level of the lowest level SPTE added to sptes.
  * That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
  */
 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level)
@@ -1540,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 
        *root_level = vcpu->arch.mmu->shadow_root_level;
 
-       rcu_read_lock();
-
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }
 
-       rcu_read_unlock();
-
        return leaf;
 }
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+                                       u64 *spte)
+{
+       struct tdp_iter iter;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       gfn_t gfn = addr >> PAGE_SHIFT;
+       tdp_ptep_t sptep = NULL;
+
+       tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+               *spte = iter.old_spte;
+               sptep = iter.sptep;
+       }
+
+       /*
+        * Perform the rcu_dereference to get the raw spte pointer value since
+        * we are passing it up to fast_page_fault, which is shared with the
+        * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+        * annotation.
+        *
+        * This is safe since fast_page_fault obeys the contracts of this
+        * function as well as all TDP MMU contracts around modifying SPTEs
+        * outside of mmu_lock.
+        */
+       return rcu_dereference(sptep);
+}
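
A hedged sketch of how a caller is expected to honor the contract spelled out above; this is not the actual fast_page_fault() code, just an illustration of bracketing the walk with the new lockless begin/end helpers and of keeping the returned sptep inside that window.

    /* Illustrative only; error handling and the real SPTE fixup are elided. */
    static int example_fast_fault(struct kvm_vcpu *vcpu, gpa_t gpa)
    {
            u64 old_spte;
            u64 *sptep;
            int ret = RET_PF_INVALID;

            kvm_tdp_mmu_walk_lockless_begin();

            sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &old_spte);
            if (sptep && is_shadow_present_pte(old_spte)) {
                    /*
                     * Any cmpxchg-based fixup of *sptep must happen here,
                     * before walk_lockless_end(); the pointer must not be
                     * cached or dereferenced after the RCU section ends.
                     */
                    ret = RET_PF_FIXED;     /* placeholder for the real fixup */
            }

            kvm_tdp_mmu_walk_lockless_end();
            return ret;
    }
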
index 1cae448..358f447 100644 (file)
@@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                          bool shared);
 
 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-                                gfn_t end, bool can_yield, bool flush,
-                                bool shared);
+                                gfn_t end, bool can_yield, bool flush);
 static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
-                                            gfn_t start, gfn_t end, bool flush,
-                                            bool shared)
+                                            gfn_t start, gfn_t end, bool flush)
 {
-       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
-                                          shared);
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
 }
 static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
@@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
         */
        lockdep_assert_held_write(&kvm->mmu_lock);
        return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
-                                          sp->gfn, end, false, false, false);
+                                          sp->gfn, end, false, false);
 }
 
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
@@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                            int min_level);
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+                            const struct kvm_memory_slot *slot, int min_level);
 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
-                                 struct kvm_memory_slot *slot);
+                                 const struct kvm_memory_slot *slot);
 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
@@ -77,8 +74,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn,
                                   int min_level);
 
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+       rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+       rcu_read_unlock();
+}
+
 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+                                       u64 *spte);
 
 #ifdef CONFIG_X86_64
 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
index 827886c..0772bad 100644 (file)
@@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
        pmc->perf_event = event;
        pmc_to_pmu(pmc)->event_count++;
        clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+       pmc->is_paused = false;
 }
 
 static void pmc_pause_counter(struct kvm_pmc *pmc)
 {
        u64 counter = pmc->counter;
 
-       if (!pmc->perf_event)
+       if (!pmc->perf_event || pmc->is_paused)
                return;
 
        /* update counter, reset event value to avoid redundant accumulation */
        counter += perf_event_pause(pmc->perf_event, true);
        pmc->counter = counter & pmc_bitmask(pmc);
+       pmc->is_paused = true;
 }
 
 static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
 
        /* reuse perf_event to serve as pmc_reprogram_counter() does */
        perf_event_enable(pmc->perf_event);
+       pmc->is_paused = false;
 
        clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
        return true;
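
In case the new flag is not obvious from the three small hunks: is_paused lets KVM fold the hardware count into pmc->counter exactly once and then ignore the (stopped) perf event until it is resumed or reprogrammed. A self-contained toy version of that bookkeeping, with invented names, looks like this:

    struct toy_pmc {
            unsigned long long counter;     /* software-accumulated count */
            int has_event;                  /* a perf event is attached   */
            int is_paused;                  /* event attached but stopped */
    };

    static void toy_pmc_pause(struct toy_pmc *pmc, unsigned long long hw_delta)
    {
            if (!pmc->has_event || pmc->is_paused)
                    return;                 /* never fold the delta in twice */
            pmc->counter += hw_delta;
            pmc->is_paused = 1;
    }

    static unsigned long long toy_pmc_read(const struct toy_pmc *pmc,
                                           unsigned long long hw_count)
    {
            /* Only add the live hardware count while the event is running. */
            if (pmc->has_event && !pmc->is_paused)
                    return pmc->counter + hw_count;
            return pmc->counter;
    }
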
index 67e753e..0e4f2b1 100644 (file)
@@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
        u64 counter, enabled, running;
 
        counter = pmc->counter;
-       if (pmc->perf_event)
+       if (pmc->perf_event && !pmc->is_paused)
                counter += perf_event_read_value(pmc->perf_event,
                                                 &enabled, &running);
        /* FIXME: Scaling needed? */
index a8ad78a..8052d92 100644 (file)
@@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm)
        vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
        vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
        vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+       vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+
        if (kvm_apicv_activated(svm->vcpu.kvm))
                vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
        else
@@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
  * field of the VMCB. Therefore, we set up the
  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
  */
-static int avic_update_access_page(struct kvm *kvm, bool activate)
+static int avic_alloc_access_page(struct kvm *kvm)
 {
        void __user *ret;
        int r = 0;
 
        mutex_lock(&kvm->slots_lock);
-       /*
-        * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
-        * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
-        * memory region. So, we need to ensure that kvm->mm == current->mm.
-        */
-       if ((kvm->arch.apic_access_memslot_enabled == activate) ||
-           (kvm->mm != current->mm))
+
+       if (kvm->arch.apic_access_memslot_enabled)
                goto out;
 
        ret = __x86_set_memory_region(kvm,
                                      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
                                      APIC_DEFAULT_PHYS_BASE,
-                                     activate ? PAGE_SIZE : 0);
+                                     PAGE_SIZE);
        if (IS_ERR(ret)) {
                r = PTR_ERR(ret);
                goto out;
        }
 
-       kvm->arch.apic_access_memslot_enabled = activate;
+       kvm->arch.apic_access_memslot_enabled = true;
 out:
        mutex_unlock(&kvm->slots_lock);
        return r;
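
For reference, the "deactivate" half of the removed toggle relied on __x86_set_memory_region() deleting a private memslot when passed a size of 0 (visible in the old "activate ? PAGE_SIZE : 0" argument above). A minimal sketch of that now-dropped path, with error handling elided:

    /* Sketch of the removed deactivation path; illustrative only. */
    static void example_free_access_page(struct kvm *kvm)
    {
            mutex_lock(&kvm->slots_lock);
            if (kvm->arch.apic_access_memslot_enabled) {
                    /* A size of 0 tears the private memslot back down. */
                    __x86_set_memory_region(kvm,
                                            APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
                                            APIC_DEFAULT_PHYS_BASE, 0);
                    kvm->arch.apic_access_memslot_enabled = false;
            }
            mutex_unlock(&kvm->slots_lock);
    }
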
@@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        if (kvm_apicv_activated(vcpu->kvm)) {
                int ret;
 
-               ret = avic_update_access_page(vcpu->kvm, true);
+               ret = avic_alloc_access_page(vcpu->kvm);
                if (ret)
                        return ret;
        }
@@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
        avic_handle_ldr_update(vcpu);
 }
 
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
-{
-       if (!enable_apicv || !lapic_in_kernel(vcpu))
-               return;
-
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-       kvm_request_apicv_update(vcpu->kvm, activate,
-                                APICV_INHIBIT_REASON_IRQWIN);
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-}
-
 void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 {
        return;
@@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
        }
        vmcb_mark_dirty(vmcb, VMCB_AVIC);
 
+       if (activated)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+
        svm_set_pi_irte_mode(vcpu, activated);
 }
 
@@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit)
        return supported & BIT(bit);
 }
 
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
-{
-       avic_update_access_page(kvm, activate);
-}
 
 static inline int
 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
@@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        int h_physical_id = kvm_cpu_get_apicid(cpu);
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
        /*
         * Since the host physical APIC id is 8 bits,
         * we can support host APIC ID up to 255.
@@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
        u64 entry;
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
        if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
                avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
@@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->avic_is_running = is_run;
+
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+
        if (is_run)
                avic_vcpu_load(vcpu, vcpu->cpu);
        else
index e551547..2545d0c 100644 (file)
@@ -666,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
                goto out;
        }
 
-
-       /* Clear internal status */
-       kvm_clear_exception_queue(vcpu);
-       kvm_clear_interrupt_queue(vcpu);
-
        /*
         * Since vmcb01 is not in use, we can use it to store some of the L1
         * state.
index 7fbce34..75e0b21 100644 (file)
@@ -28,8 +28,6 @@
 #include "cpuid.h"
 #include "trace.h"
 
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
 #ifndef CONFIG_KVM_AMD_SEV
 /*
  * When this config is not defined, SEV feature is not supported and APIs in
@@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
        save->xcr0 = svm->vcpu.arch.xcr0;
        save->pkru = svm->vcpu.arch.pkru;
        save->xss  = svm->vcpu.arch.ia32_xss;
+       save->dr6  = svm->vcpu.arch.dr6;
 
        /*
         * SEV-ES will use a VMSA that is pointed to by the VMCB, not
index 69639f9..05e8d4d 100644 (file)
@@ -46,8 +46,6 @@
 #include "kvm_onhyperv.h"
 #include "svm_onhyperv.h"
 
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -261,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr)
 static int get_max_npt_level(void)
 {
 #ifdef CONFIG_X86_64
-       return PT64_ROOT_4LEVEL;
+       return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 #else
        return PT32E_ROOT_LEVEL;
 #endif
@@ -462,11 +460,6 @@ static int has_svm(void)
                return 0;
        }
 
-       if (pgtable_l5_enabled()) {
-               pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
-               return 0;
-       }
-
        return 1;
 }
 
@@ -1015,7 +1008,9 @@ static __init int svm_hardware_setup(void)
        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
 
-       kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+       /* Force VM NPT level equal to the host's max NPT level */
+       kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+                         get_max_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
        /* Note, SEV setup consumes npt_enabled. */
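
The extra argument reflects what appears to be a four-parameter kvm_configure_mmu() (enable_tdp, forced root level, max root level, max huge page level) introduced alongside 5-level NPT; treat the exact prototype as an inference from this call site rather than a definitive signature. SVM passes the host level for both level arguments to force the NPT root level, whereas a caller that lets KVM size the root per-VM would pass 0 for the forced level, roughly as below (the non-SVM names are placeholders):

    /* Inferred usage only -- check mmu.c for the authoritative prototype. */
    kvm_configure_mmu(npt_enabled, get_max_npt_level(),   /* forced level */
                      get_max_npt_level(),                /* max level    */
                      PG_LEVEL_1G);                       /* max hugepage */

    /* Hypothetical non-forcing caller (EPT-style): forced level of 0. */
    kvm_configure_mmu(enable_tdp, 0, max_tdp_level, max_huge_page_level);
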
@@ -1161,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       vcpu->arch.hflags = 0;
-
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
        svm_set_intercept(svm, INTERCEPT_CR4_READ);
@@ -1241,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
        save->cs.limit = 0xffff;
 
+       save->gdtr.base = 0;
        save->gdtr.limit = 0xffff;
+       save->idtr.base = 0;
        save->idtr.limit = 0xffff;
 
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_cr4(vcpu, 0);
-       svm_set_efer(vcpu, 0);
-       save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
-       save->rip = 0x0000fff0;
-       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
-
-       /*
-        * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
-        * It also updates the guest-visible cr0 value.
-        */
-       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(vcpu);
-
-       save->cr4 = X86_CR4_PAE;
-       /* rdx = ?? */
-
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
@@ -1273,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
-               save->cr4 = 0;
        }
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
        svm->nested.vmcb12_gpa = INVALID_GPA;
        svm->nested.last_vmcb12_gpa = INVALID_GPA;
-       vcpu->arch.hflags = 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
@@ -1330,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u32 dummy;
-       u32 eax = 1;
 
        svm->spec_ctrl = 0;
        svm->virt_spec_ctrl = 0;
 
-       if (!init_event) {
-               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                      MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(vcpu))
-                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
-       }
        init_vmcb(vcpu);
-
-       kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
-       kvm_rdx_write(vcpu, eax);
-
-       if (kvm_vcpu_apicv_active(vcpu) && !init_event)
-               avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@ -1513,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
        }
-       avic_vcpu_load(vcpu, cpu);
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_vcpu_load(vcpu, cpu);
 }
 
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       avic_vcpu_put(vcpu);
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_vcpu_put(vcpu);
+
        svm_prepare_host_switch(vcpu);
 
        ++vcpu->stat.host_state_reload;
@@ -1560,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                break;
        default:
-               WARN_ON_ONCE(1);
+               KVM_BUG_ON(1, vcpu->kvm);
        }
 }
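
Several hunks in this series swap WARN_ON_ONCE()/WARN_ONCE() for KVM_BUG_ON()/KVM_BUG(). A hedged sketch of the resulting idiom (the real macros live in kvm_host.h and also mark the whole VM as bugged and kick its vCPUs; details omitted here):

    /* Usage sketch only; KVM_BUG_ON() hands back the condition it checked. */
    static int example_handler(struct kvm_vcpu *vcpu, bool bad_state)
    {
            if (KVM_BUG_ON(bad_state, vcpu->kvm))
                    return -EIO;    /* VM is marked dead, stop processing */

            return 1;               /* normal handling continues */
    }
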
 
@@ -2078,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        /*
-        * VMCB is undefined after a SHUTDOWN intercept
-        * so reinitialize it.
+        * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
+        * the VMCB in a known good state.  Unfortunately, KVM doesn't have
+        * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+        * userspace.  From a platform perspective, INIT is acceptable behavior as
+        * there exist bare metal platforms that automatically INIT the CPU
+        * in response to shutdown.
         */
        clear_page(svm->vmcb);
-       init_vmcb(vcpu);
+       kvm_vcpu_reset(vcpu, true);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
@@ -2993,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->msr_decfg = data;
                break;
        }
-       case MSR_IA32_APICBASE:
-               if (kvm_vcpu_apicv_active(vcpu))
-                       avic_update_vapic_bar(to_svm(vcpu), data);
-               fallthrough;
        default:
                return kvm_set_msr_common(vcpu, msr);
        }
@@ -3021,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(vcpu, true);
+       kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
 
        ++vcpu->stat.irq_window_exits;
        return 1;
@@ -3269,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               "excp_to:", save->last_excp_to);
 }
 
-static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
-           svm_exit_handlers[exit_code])
-               return 0;
+       return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+               svm_exit_handlers[exit_code]);
+}
 
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
        vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
        dump_vmcb(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3282,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
        vcpu->run->internal.ndata = 2;
        vcpu->run->internal.data[0] = exit_code;
        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
-
-       return -EINVAL;
+       return 0;
 }
 
 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (svm_handle_invalid_exit(vcpu, exit_code))
-               return 0;
+       if (!svm_check_exit_valid(vcpu, exit_code))
+               return svm_handle_invalid_exit(vcpu, exit_code);
 
 #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
@@ -3573,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
                 * via AVIC. In such case, we need to temporarily disable AVIC,
                 * and fallback to injecting IRQ via V_IRQ.
                 */
-               svm_toggle_avic_for_irq_window(vcpu, false);
+               kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
                svm_set_vintr(svm);
        }
 }
@@ -3808,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        pre_svm_run(vcpu);
 
+       WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
        sync_lapic_to_cr8(vcpu);
 
        if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@ -4610,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .set_virtual_apic_mode = svm_set_virtual_apic_mode,
        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
        .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
-       .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
        .load_eoi_exitmap = svm_load_eoi_exitmap,
        .hwapic_irr_update = svm_hwapic_irr_update,
        .hwapic_isr_update = svm_hwapic_isr_update,
index bd0fe94..524d943 100644 (file)
@@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 
 #define VMCB_AVIC_APIC_BAR_MASK                0xFFFFFFFFFF000ULL
 
-static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
-{
-       svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
-       vmcb_mark_dirty(svm->vmcb, VMCB_AVIC);
-}
-
 static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -524,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag);
 void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm);
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
@@ -534,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu);
 void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
 void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
 bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate);
 void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
 void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
index 8170f2a..22e2b01 100644 (file)
@@ -4,7 +4,7 @@
 
 #include <linux/compiler_types.h>
 
-#include <asm/kvm_host.h>
+#include "x86.h"
 
 #define svm_asm(insn, clobber...)                              \
 do {                                                           \
index 896b2a5..0dab1b7 100644 (file)
@@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs);
 
 #if IS_ENABLED(CONFIG_HYPERV)
 
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
 #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
 #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
                {EVMCS1_OFFSET(name), clean_field}
index 2ec9b46..152ab0a 100644 (file)
@@ -73,8 +73,6 @@ struct evmcs_field {
 extern const struct evmcs_field vmcs_field_to_evmcs_1[];
 extern const unsigned int nr_evmcs_1_fields;
 
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
 static __always_inline int get_evmcs_offset(unsigned long field,
                                            u16 *clean_field)
 {
@@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field,
        return evmcs_field->offset;
 }
 
-#undef ROL16
-
 static inline void evmcs_write64(unsigned long field, u64 value)
 {
        u16 clean_field;
index b3f77d1..ccb03d6 100644 (file)
@@ -2207,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
        }
 }
 
-static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+                                struct vmcs12 *vmcs12)
 {
        u32 exec_control;
        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
@@ -2218,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
        /*
         * PIN CONTROLS
         */
-       exec_control = vmx_pin_based_exec_ctrl(vmx);
+       exec_control = __pin_controls_get(vmcs01);
        exec_control |= (vmcs12->pin_based_vm_exec_control &
                         ~PIN_BASED_VMX_PREEMPTION_TIMER);
 
        /* Posted interrupts setting is only taken from vmcs12.  */
-       if (nested_cpu_has_posted_intr(vmcs12)) {
+       vmx->nested.pi_pending = false;
+       if (nested_cpu_has_posted_intr(vmcs12))
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
-               vmx->nested.pi_pending = false;
-       } else {
+       else
                exec_control &= ~PIN_BASED_POSTED_INTR;
-       }
        pin_controls_set(vmx, exec_control);
 
        /*
         * EXEC CONTROLS
         */
-       exec_control = vmx_exec_control(vmx); /* L0's desires */
+       exec_control = __exec_controls_get(vmcs01); /* L0's desires */
        exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
        exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
        exec_control &= ~CPU_BASED_TPR_SHADOW;
@@ -2271,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
         * SECONDARY EXEC CONTROLS
         */
        if (cpu_has_secondary_exec_ctrls()) {
-               exec_control = vmx->secondary_exec_control;
+               exec_control = __secondary_exec_controls_get(vmcs01);
 
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                                  SECONDARY_EXEC_ENABLE_INVPCID |
                                  SECONDARY_EXEC_ENABLE_RDTSCP |
                                  SECONDARY_EXEC_XSAVES |
@@ -2282,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_ENABLE_VMFUNC |
-                                 SECONDARY_EXEC_TSC_SCALING);
+                                 SECONDARY_EXEC_TSC_SCALING |
+                                 SECONDARY_EXEC_DESC);
+
                if (nested_cpu_has(vmcs12,
                                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
@@ -2322,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
         * on the related bits (if supported by the CPU) in the hope that
         * we can avoid VMWrites during vmx_set_efer().
         */
-       exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
-                       ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+       exec_control = __vm_entry_controls_get(vmcs01);
+       exec_control |= vmcs12->vm_entry_controls;
+       exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
        if (cpu_has_load_ia32_efer()) {
                if (guest_efer & EFER_LMA)
                        exec_control |= VM_ENTRY_IA32E_MODE;
@@ -2339,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits may be modified by vmx_set_efer() in prepare_vmcs02().
         */
-       exec_control = vmx_vmexit_ctrl();
+       exec_control = __vm_exit_controls_get(vmcs01);
        if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
                exec_control |= VM_EXIT_LOAD_IA32_EFER;
+       else
+               exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
        vm_exit_controls_set(vmx, exec_control);
 
        /*
@@ -3384,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 
-       prepare_vmcs02_early(vmx, vmcs12);
+       prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
 
        if (from_vmentry) {
                if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
@@ -4304,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                seg.l = 1;
        else
                seg.db = 1;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
        seg = (struct kvm_segment) {
                .base = 0,
                .limit = 0xFFFFFFFF,
@@ -4315,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                .g = 1
        };
        seg.selector = vmcs12->host_ds_selector;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
        seg.selector = vmcs12->host_es_selector;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
        seg.selector = vmcs12->host_ss_selector;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
        seg.selector = vmcs12->host_fs_selector;
        seg.base = vmcs12->host_fs_base;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
        seg.selector = vmcs12->host_gs_selector;
        seg.base = vmcs12->host_gs_base;
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
        seg = (struct kvm_segment) {
                .base = vmcs12->host_tr_base,
                .limit = 0x67,
@@ -4333,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                .type = 11,
                .present = 1
        };
-       vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+       memset(&seg, 0, sizeof(seg));
+       seg.unusable = 1;
+       __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
 
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 
-       if (cpu_has_vmx_msr_bitmap())
-               vmx_update_msr_bitmap(vcpu);
-
        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
                                vmcs12->vm_exit_msr_load_count))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
@@ -4419,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
 
        kvm_mmu_reset_context(vcpu);
 
-       if (cpu_has_vmx_msr_bitmap())
-               vmx_update_msr_bitmap(vcpu);
-
        /*
         * This nasty bit of open coding is a compromise between blindly
         * loading L1's MSRs using the exit load lists (incorrect emulation
index 9efc1a6..10cc4f6 100644 (file)
@@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                            !(msr & MSR_PMC_FULL_WIDTH_BIT))
                                data = (s64)(s32)data;
                        pmc->counter += data - pmc_read_counter(pmc);
-                       if (pmc->perf_event)
+                       if (pmc->perf_event && !pmc->is_paused)
                                perf_event_period(pmc->perf_event,
                                                  get_sample_period(pmc, data));
                        return 0;
                } else if ((pmc = get_fixed_pmc(pmu, msr))) {
                        pmc->counter += data - pmc_read_counter(pmc);
-                       if (pmc->perf_event)
+                       if (pmc->perf_event && !pmc->is_paused)
                                perf_event_period(pmc->perf_event,
                                                  get_sample_period(pmc, data));
                        return 0;
index 4b9957e..6e5de2e 100644 (file)
@@ -11,6 +11,8 @@
 
 #include "capabilities.h"
 
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+
 struct vmcs_hdr {
        u32 revision_id:31;
        u32 shadow_vmcs:1;
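
Since ROL16() now lives in one place, here is a standalone check of what the rotate actually does for the field-to-offset tables built with FIELD()/EVMCS1_FIELD(): rotating a 16-bit VMCS field encoding left by 6 folds its high bits down into a small, dense array index. The two encodings below are example values chosen for illustration.

    #include <stdio.h>
    #include <stdint.h>

    #define ROL16(val, n) ((uint16_t)(((uint16_t)(val) << (n)) | \
                                      ((uint16_t)(val) >> (16 - (n)))))

    int main(void)
    {
            /* 0x4000 -> 0x0010, 0x4002 -> 0x0090 after a left-rotate by 6 */
            printf("%#x -> %#x\n", 0x4000, ROL16(0x4000, 6));
            printf("%#x -> %#x\n", 0x4002, ROL16(0x4002, 6));
            return 0;
    }
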
index d9f5d7c..cab6ba7 100644 (file)
@@ -2,7 +2,6 @@
 
 #include "vmcs12.h"
 
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
 #define FIELD(number, name)    [ROL16(number, 6)] = VMCS12_OFFSET(name)
 #define FIELD64(number, name)                                          \
index 5e0e1b3..2a45f02 100644 (file)
@@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void)
 extern const unsigned short vmcs_field_to_offset_table[];
 extern const unsigned int nr_vmcs12_fields;
 
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
 static inline short vmcs_field_to_offset(unsigned long field)
 {
        unsigned short offset;
@@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field)
        return offset;
 }
 
-#undef ROL16
-
 static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
                                  u16 offset)
 {
index 927a552..0c2c0d5 100644 (file)
@@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON                           \
-       (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
-        X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+       (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 
 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
 }
 
 /*
- * Set up the vmcs to automatically save and restore system
- * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
+ * Configure user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest.  Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
  */
-static void setup_msrs(struct vcpu_vmx *vmx)
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
 {
 #ifdef CONFIG_X86_64
        bool load_syscall_msrs;
@@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
         */
        vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
 
-       if (cpu_has_vmx_msr_bitmap())
-               vmx_update_msr_bitmap(&vmx->vcpu);
-
        /*
         * The set of MSRs to load may have changed, reload MSRs before the
         * next VM-Enter.
@@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
                break;
        case VCPU_EXREG_CR3:
-               if (is_unrestricted_guest(vcpu) ||
-                   (enable_ept && is_paging(vcpu)))
+               /*
+                * When intercepting CR3 loads, e.g. for shadow paging, KVM's
+                * CR3 is loaded into hardware, not the guest's CR3.
+                */
+               if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
                        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
                break;
        case VCPU_EXREG_CR4:
@@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
                break;
        default:
-               WARN_ON_ONCE(1);
+               KVM_BUG_ON(1, vcpu->kvm);
                break;
        }
 }
@@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
                save->dpl = save->selector & SEGMENT_RPL_MASK;
                save->s = 1;
        }
-       vmx_set_segment(vcpu, save, seg);
+       __vmx_set_segment(vcpu, save, seg);
 }
 
 static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
        vmx->rmode.vm86_active = 0;
 
-       vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+       __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
 
        flags = vmcs_readl(GUEST_RFLAGS);
        flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
        fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
        fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
-       kvm_mmu_reset_context(vcpu);
 }
 
 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
                msr->data = efer & ~EFER_LME;
        }
-       setup_msrs(vmx);
+       vmx_setup_uret_msrs(vmx);
        return 0;
 }
 
@@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 }
 
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
-                                       unsigned long cr0,
-                                       struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
-               vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
-       if (!(cr0 & X86_CR0_PG)) {
-               /* From paging/starting to nonpaging */
-               exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-                                         CPU_BASED_CR3_STORE_EXITING);
-               vcpu->arch.cr0 = cr0;
-               vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-       } else if (!is_paging(vcpu)) {
-               /* From nonpaging to paging */
-               exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-                                           CPU_BASED_CR3_STORE_EXITING);
-               vcpu->arch.cr0 = cr0;
-               vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-       }
-
-       if (!(cr0 & X86_CR0_WP))
-               *hw_cr0 &= ~X86_CR0_WP;
-}
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+                         CPU_BASED_CR3_STORE_EXITING)
 
 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long hw_cr0;
+       unsigned long hw_cr0, old_cr0_pg;
+       u32 tmp;
+
+       old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 
        hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
        if (is_unrestricted_guest(vcpu))
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+               if (!enable_ept)
+                       hw_cr0 |= X86_CR0_WP;
 
                if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
                        enter_pmode(vcpu);
@@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                        enter_rmode(vcpu);
        }
 
+       vmcs_writel(CR0_READ_SHADOW, cr0);
+       vmcs_writel(GUEST_CR0, hw_cr0);
+       vcpu->arch.cr0 = cr0;
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
 #ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME) {
-               if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+               if (!old_cr0_pg && (cr0 & X86_CR0_PG))
                        enter_lmode(vcpu);
-               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+               else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
                        exit_lmode(vcpu);
        }
 #endif
 
-       if (enable_ept && !is_unrestricted_guest(vcpu))
-               ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+       if (enable_ept && !is_unrestricted_guest(vcpu)) {
+               /*
+                * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
+                * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+                * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+                * KVM's CR3 is installed.
+                */
+               if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+                       vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
 
-       vmcs_writel(CR0_READ_SHADOW, cr0);
-       vmcs_writel(GUEST_CR0, hw_cr0);
-       vcpu->arch.cr0 = cr0;
-       kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+               /*
+                * When running with EPT but not unrestricted guest, KVM must
+                * intercept CR3 accesses when paging is _disabled_.  This is
+                * necessary because restricted guests can't actually run with
+                * paging disabled, and so KVM stuffs its own CR3 in order to
+                * run the guest with identity mapped page tables.
+                *
+                * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+                * update, it may be stale with respect to CR3 interception,
+                * e.g. after nested VM-Enter.
+                *
+                * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+                * stores to forward them to L1, even if KVM does not need to
+                * intercept them to preserve its identity mapped page tables.
+                */
+               if (!(cr0 & X86_CR0_PG)) {
+                       exec_controls_setbit(vmx, CR3_EXITING_BITS);
+               } else if (!is_guest_mode(vcpu)) {
+                       exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+               } else {
+                       tmp = exec_controls_get(vmx);
+                       tmp &= ~CR3_EXITING_BITS;
+                       tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+                       exec_controls_set(vmx, tmp);
+               }
+
+               /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+               if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+                       vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+       }
 
        /* depends on vcpu->arch.cr0 to be set to a new value */
        vmx->emulation_required = emulation_required(vcpu);
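
The guest-mode branch above merges L1's CR3-exiting intent into the current execution controls with the usual "clear our bits, then copy theirs under the same mask" pattern. A standalone toy version of that merge, using the two CR3-exiting bit positions purely as example values:

    #include <stdio.h>
    #include <stdint.h>

    #define LOAD_EXITING   (1u << 15)       /* example bit positions */
    #define STORE_EXITING  (1u << 16)
    #define CR3_BITS       (LOAD_EXITING | STORE_EXITING)

    int main(void)
    {
            uint32_t cur = 0x00000005u | LOAD_EXITING;  /* current controls */
            uint32_t l1  = STORE_EXITING;               /* L1's requested bits */

            /* Keep everything outside the mask, take L1's bits inside it. */
            uint32_t merged = (cur & ~CR3_BITS) | (l1 & CR3_BITS);

            printf("%#x\n", merged);        /* prints 0x10005 */
            return 0;
    }
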
@@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
        return ar;
 }
 
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
                        vmcs_write16(sf->selector, var->selector);
                else if (var->s)
                        fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
-               goto out;
+               return;
        }
 
        vmcs_writel(sf->base, var->base);
@@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
                var->type |= 0x1; /* Accessed */
 
        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
 
-out:
-       vmx->emulation_required = emulation_required(vcpu);
+static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+       __vmx_set_segment(vcpu, var, seg);
+
+       to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
 }
 
 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
                vmx_set_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
-       u8 mode = 0;
-
-       if (cpu_has_secondary_exec_ctrls() &&
-           (secondary_exec_controls_get(to_vmx(vcpu)) &
-            SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-               mode |= MSR_BITMAP_MODE_X2APIC;
-               if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
-                       mode |= MSR_BITMAP_MODE_X2APIC_APICV;
-       }
-
-       return mode;
-}
-
 static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
 {
        unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
@@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
        }
 }
 
-static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u8 mode;
+
        if (!cpu_has_vmx_msr_bitmap())
                return;
 
+       if (cpu_has_secondary_exec_ctrls() &&
+           (secondary_exec_controls_get(vmx) &
+            SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+               mode = MSR_BITMAP_MODE_X2APIC;
+               if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+                       mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+       } else {
+               mode = 0;
+       }
+
+       if (mode == vmx->x2apic_msr_bitmap_mode)
+               return;
+
+       vmx->x2apic_msr_bitmap_mode = mode;
+
        vmx_reset_x2apic_msrs(vcpu, mode);
 
        /*
@@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
        }
 }
 
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u8 mode = vmx_msr_bitmap_mode(vcpu);
-       u8 changed = mode ^ vmx->msr_bitmap_mode;
-
-       if (!changed)
-               return;
-
-       if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
-               vmx_update_msr_bitmap_x2apic(vcpu, mode);
-
-       vmx->msr_bitmap_mode = mode;
-}
-
 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
        }
 
        pt_update_intercept_for_msr(vcpu);
-       vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
 }
 
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
        vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
 }
 
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 {
        u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 
@@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
        return pin_based_exec_ctrl;
 }
 
+static u32 vmx_vmentry_ctrl(void)
+{
+       u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+       if (vmx_pt_mode_is_system())
+               vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+                                 VM_ENTRY_LOAD_IA32_RTIT_CTL);
+       /* Loading of EFER and PERF_GLOBAL_CTRL is toggled dynamically */
+       return vmentry_ctrl &
+               ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
+}
+
+static u32 vmx_vmexit_ctrl(void)
+{
+       u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+       if (vmx_pt_mode_is_system())
+               vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+                                VM_EXIT_CLEAR_IA32_RTIT_CTL);
+       /* Loading of EFER and PERF_GLOBAL_CTRL is toggled dynamically */
+       return vmexit_ctrl &
+               ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
                                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        }
 
-       if (cpu_has_vmx_msr_bitmap())
-               vmx_update_msr_bitmap(vcpu);
+       vmx_update_msr_bitmap_x2apic(vcpu);
 }
 
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
 
@@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
        vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
 
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        struct kvm_vcpu *vcpu = &vmx->vcpu;
 
@@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!vcpu->kvm->arch.bus_lock_detection_enabled)
                exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
 
-       vmx->secondary_exec_control = exec_control;
+       return exec_control;
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
@@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 
        exec_controls_set(vmx, vmx_exec_control(vmx));
 
-       if (cpu_has_secondary_exec_ctrls()) {
-               vmx_compute_secondary_exec_control(vmx);
-               secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
-       }
+       if (cpu_has_secondary_exec_ctrls())
+               secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
 
        if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
@@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmx->pt_desc.guest.output_mask = 0x7F;
                vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
        }
+
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+               if (cpu_need_tpr_shadow(&vmx->vcpu))
+                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+                                    __pa(vmx->vcpu.arch.apic->regs));
+               vmcs_write32(TPR_THRESHOLD, 0);
+       }
+
+       vmx_setup_uret_msrs(vmx);
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct msr_data apic_base_msr;
-       u64 cr0;
 
        vmx->rmode.vm86_active = 0;
        vmx->spec_ctrl = 0;
 
        vmx->msr_ia32_umwait_control = 0;
 
-       vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);
 
-       if (!init_event) {
-               apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
-                                    MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(vcpu))
-                       apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
-               apic_base_msr.host_initiated = true;
-               kvm_set_apic_base(vcpu, &apic_base_msr);
-       }
-
        vmx_segment_cache_clear(vmx);
 
        seg_setup(VCPU_SREG_CS);
@@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 
-       if (!init_event) {
-               vmcs_write32(GUEST_SYSENTER_CS, 0);
-               vmcs_writel(GUEST_SYSENTER_ESP, 0);
-               vmcs_writel(GUEST_SYSENTER_EIP, 0);
-               vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-       }
-
-       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
-       kvm_rip_write(vcpu, 0xfff0);
-
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
 
@@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        if (kvm_mpx_supported())
                vmcs_write64(GUEST_BNDCFGS, 0);
 
-       setup_msrs(vmx);
-
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
 
-       if (cpu_has_vmx_tpr_shadow() && !init_event) {
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-               if (cpu_need_tpr_shadow(vcpu))
-                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-                                    __pa(vcpu->arch.apic->regs));
-               vmcs_write32(TPR_THRESHOLD, 0);
-       }
-
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
-       cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-       vmx->vcpu.arch.cr0 = cr0;
-       vmx_set_cr0(vcpu, cr0); /* enter rmode */
-       vmx_set_cr4(vcpu, 0);
-       vmx_set_efer(vcpu, 0);
-
-       vmx_update_exception_bitmap(vcpu);
-
        vpid_sync_context(vmx->vpid);
-       if (init_event)
-               vmx_clear_hlt(vcpu);
 }
 
 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                        return kvm_complete_insn_gp(vcpu, err);
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);
+
                        err = kvm_set_cr3(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 4:
@@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                }
                break;
        case 2: /* clts */
-               WARN_ONCE(1, "Guest should always own CR0.TS");
-               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-               trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
-               return kvm_skip_emulated_instruction(vcpu);
+               KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+               return -EIO;
        case 1: /*mov from cr*/
                switch (cr) {
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);
+
                        val = kvm_read_cr3(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
@@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+       /*
+        * exc_debug expects dr6 to be cleared after it runs, so make sure it
+        * never sees a stale dr6 value from the guest.
+        */
+       set_debugreg(DR6_RESERVED, 6);
 }
 
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 
 static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
-       WARN_ON_ONCE(!enable_vnmi);
+       if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+               return -EIO;
+
        exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
         * below) should never happen as that means we incorrectly allowed a
         * nested VM-Enter with an invalid vmcs12.
         */
-       WARN_ON_ONCE(vmx->nested.nested_run_pending);
+       if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+               return -EIO;
 
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required)
@@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
        }
        secondary_exec_controls_set(vmx, sec_exec_control);
 
-       vmx_update_msr_bitmap(vcpu);
+       vmx_update_msr_bitmap_x2apic(vcpu);
 }
 
 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
@@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
        int max_irr;
        bool max_irr_updated;
 
-       WARN_ON(!vcpu->arch.apicv_active);
+       if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+               return -EIO;
+
        if (pi_test_on(&vmx->pi_desc)) {
                pi_clear_on(&vmx->pi_desc);
                /*
@@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
        unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
        gate_desc *desc = (gate_desc *)host_idt_base + vector;
 
-       if (WARN_ONCE(!is_external_intr(intr_info),
+       if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
 
@@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       if (vmx->emulation_required)
+               return;
+
        if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
                handle_external_interrupt_irqoff(vcpu);
        else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
@@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmx->loaded_vmcs->host_state.cr4 = cr4;
        }
 
+       /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+               set_debugreg(vcpu->arch.dr6, 6);
+
        /* When single-stepping over STI and MOV SS, we must clear the
         * corresponding interruptibility bits in the guest state. Otherwise
         * vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
        }
-       vmx->msr_bitmap_mode = 0;
 
        vmx->loaded_vmcs = &vmx->vmcs01;
        cpu = get_cpu();
@@ -6997,7 +7017,7 @@ exit:
        return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
 }
 
-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
 {
        /*
         * These bits in the secondary execution controls field
@@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_DESC;
 
-       u32 new_ctl = vmx->secondary_exec_control;
        u32 cur_ctl = secondary_exec_controls_get(vmx);
 
        secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
@@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
        /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
        vcpu->arch.xsaves_enabled = false;
 
-       if (cpu_has_secondary_exec_ctrls()) {
-               vmx_compute_secondary_exec_control(vmx);
-               vmcs_set_secondary_exec_control(vmx);
-       }
+       vmx_setup_uret_msrs(vmx);
+
+       if (cpu_has_secondary_exec_ctrls())
+               vmcs_set_secondary_exec_control(vmx,
+                                               vmx_secondary_exec_control(vmx));
 
        if (nested_vmx_allowed(vcpu))
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -7803,7 +7823,8 @@ static __init int hardware_setup(void)
                ept_lpage_level = PG_LEVEL_2M;
        else
                ept_lpage_level = PG_LEVEL_4K;
-       kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+       kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+                         ept_lpage_level);
 
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
index 17a1cb4..4858c5f 100644
@@ -227,7 +227,7 @@ struct nested_vmx {
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        u8                    fail;
-       u8                    msr_bitmap_mode;
+       u8                    x2apic_msr_bitmap_mode;
 
        /*
         * If true, host state has been stored in vmx->loaded_vmcs for
@@ -263,8 +263,6 @@ struct vcpu_vmx {
        u64                   spec_ctrl;
        u32                   msr_ia32_umwait_control;
 
-       u32 secondary_exec_control;
-
        /*
         * loaded_vmcs points to the VMCS currently used in this vcpu. For a
         * non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -371,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 
 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
@@ -419,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val)         \
                vmx->loaded_vmcs->controls_shadow.lname = val;              \
        }                                                                   \
 }                                                                          \
+static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs)       \
+{                                                                          \
+       return vmcs->controls_shadow.lname;                                 \
+}                                                                          \
 static inline u32 lname##_controls_get(struct vcpu_vmx *vmx)               \
 {                                                                          \
-       return vmx->loaded_vmcs->controls_shadow.lname;                     \
+       return __##lname##_controls_get(vmx->loaded_vmcs);                  \
 }                                                                          \
 static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val)   \
 {                                                                          \
@@ -451,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.regs_dirty = 0;
 }
 
-static inline u32 vmx_vmentry_ctrl(void)
-{
-       u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
-       if (vmx_pt_mode_is_system())
-               vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
-                                 VM_ENTRY_LOAD_IA32_RTIT_CTL);
-       /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
-       return vmentry_ctrl &
-               ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
-}
-
-static inline u32 vmx_vmexit_ctrl(void)
-{
-       u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
-       if (vmx_pt_mode_is_system())
-               vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
-                                VM_EXIT_CLEAR_IA32_RTIT_CTL);
-       /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
-       return vmexit_ctrl &
-               ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
-}
-
-u32 vmx_exec_control(struct vcpu_vmx *vmx);
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx);
-
 static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
 {
        return container_of(kvm, struct kvm_vmx, kvm);
index 164b64f..9e9ef47 100644
@@ -4,13 +4,11 @@
 
 #include <linux/nospec.h>
 
-#include <asm/kvm_host.h>
 #include <asm/vmx.h>
 
 #include "evmcs.h"
 #include "vmcs.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#include "x86.h"
 
 asmlinkage void vmread_error(unsigned long field, bool fault);
 __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
index e5d5c5e..28ef141 100644
@@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        STATS_DESC_COUNTER(VM, mmu_recycled),
        STATS_DESC_COUNTER(VM, mmu_cache_miss),
        STATS_DESC_ICOUNTER(VM, mmu_unsync),
-       STATS_DESC_ICOUNTER(VM, lpages),
+       STATS_DESC_ICOUNTER(VM, pages_4k),
+       STATS_DESC_ICOUNTER(VM, pages_2m),
+       STATS_DESC_ICOUNTER(VM, pages_1g),
        STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+       STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
        STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
        STATS_DESC_ICOUNTER(VCPU, guest_mode)
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-asmlinkage __visible noinstr void kvm_spurious_fault(void)
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running.  Usually after catching the
+ * fault we just panic; during reboot instead the instruction is ignored.
+ */
+noinstr void kvm_spurious_fault(void)
 {
        /* Fault while not rebooting.  We want the trace. */
        BUG_ON(!kvm_rebooting);
@@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
                for (i = 0; i < KVM_NR_DB_REGS; i++)
                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
        }
 }
 
@@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        if (!msr_info->host_initiated) {
                                s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
                                adjust_tsc_offset_guest(vcpu, adj);
+                               /* Before returning to the guest, tsc_timestamp must be
+                                * adjusted as well, otherwise the guest's per-CPU pvclock time could jump.
+                                */
+                               kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
                        }
                        vcpu->arch.ia32_tsc_adjust_msr = data;
                }
@@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
        static_call(kvm_x86_vcpu_put)(vcpu);
        vcpu->arch.last_host_tsc = rdtsc();
-       /*
-        * If userspace has set any breakpoints or watchpoints, dr6 is restored
-        * on every vmexit, but if not, we might have a stale dr6 from the
-        * guest. do_debug expects dr6 to be cleared after it runs, do the same.
-        */
-       set_debugreg(0, 6);
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6567,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
         * there is no pkey in EPT page table for L1 guest or EPT
         * shadow page table for L2 guest.
         */
-       if (vcpu_match_mmio_gva(vcpu, gva)
-           && !permission_fault(vcpu, vcpu->arch.walk_mmu,
-                                vcpu->arch.mmio_access, 0, access)) {
+       if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+           !permission_fault(vcpu, vcpu->arch.walk_mmu,
+                             vcpu->arch.mmio_access, 0, access))) {
                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                        (gva & (PAGE_SIZE - 1));
                trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -8578,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
 static void kvm_apicv_init(struct kvm *kvm)
 {
+       mutex_init(&kvm->arch.apicv_update_lock);
+
        if (enable_apicv)
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
@@ -8891,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                can_inject = false;
        }
 
+       /* Don't inject interrupts if the user asked to avoid doing so */
+       if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
+               return 0;
+
        /*
         * Finally, inject interrupt events.  If an event cannot be injected
         * due to architectural conditions (e.g. IF=0) a window-open exit
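
For context, KVM_GUESTDBG_BLOCKIRQ is requested by userspace through the existing KVM_SET_GUEST_DEBUG ioctl. A minimal sketch of how a debugger-style VMM might ask for it while single-stepping (the vcpu_fd handling is assumed for illustration, not taken from this series):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Illustrative only: keep IRQs from being injected while single-stepping. */
	static int block_irqs_while_debugging(int vcpu_fd)
	{
		struct kvm_guest_debug dbg = {
			.control = KVM_GUESTDBG_ENABLE |
				   KVM_GUESTDBG_SINGLESTEP |
				   KVM_GUESTDBG_BLOCKIRQ,
		};

		return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
	}
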
@@ -9236,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
 
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
+       bool activate;
+
        if (!lapic_in_kernel(vcpu))
                return;
 
-       vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+       mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+       activate = kvm_apicv_activated(vcpu->kvm);
+       if (vcpu->arch.apicv_active == activate)
+               goto out;
+
+       vcpu->arch.apicv_active = activate;
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
 
@@ -9251,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
         */
        if (!vcpu->arch.apicv_active)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+       mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
-/*
- * NOTE: Do not hold any lock prior to calling this.
- *
- * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
- * locked, because it calls __x86_set_memory_region() which does
- * synchronize_srcu(&kvm->srcu).
- */
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
-       struct kvm_vcpu *except;
-       unsigned long old, new, expected;
+       unsigned long old, new;
 
        if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
            !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
                return;
 
-       old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
-       do {
-               expected = new = old;
-               if (activate)
-                       __clear_bit(bit, &new);
-               else
-                       __set_bit(bit, &new);
-               if (new == old)
-                       break;
-               old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
-       } while (old != expected);
-
-       if (!!old == !!new)
-               return;
+       old = new = kvm->arch.apicv_inhibit_reasons;
 
-       trace_kvm_apicv_update_request(activate, bit);
-       if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
-               static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
+       if (activate)
+               __clear_bit(bit, &new);
+       else
+               __set_bit(bit, &new);
+
+       if (!!old != !!new) {
+               trace_kvm_apicv_update_request(activate, bit);
+               kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+               kvm->arch.apicv_inhibit_reasons = new;
+               if (new) {
+                       unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+                       kvm_zap_gfn_range(kvm, gfn, gfn+1);
+               }
+       } else
+               kvm->arch.apicv_inhibit_reasons = new;
+}
+EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
 
-       /*
-        * Sending request to update APICV for all other vcpus,
-        * while update the calling vcpu immediately instead of
-        * waiting for another #VMEXIT to handle the request.
-        */
-       except = kvm_get_running_vcpu();
-       kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
-                                        except);
-       if (except)
-               kvm_vcpu_update_apicv(except);
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+       mutex_lock(&kvm->arch.apicv_update_lock);
+       __kvm_request_apicv_update(kvm, activate, bit);
+       mutex_unlock(&kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
 
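
At the call sites nothing changes: arch code still flips an inhibit reason through kvm_request_apicv_update(), which now simply wraps __kvm_request_apicv_update() with the new apicv_update_lock, while paths that already hold the lock call the __ variant directly. A sketch, using an existing inhibit reason purely as an example:

	/* Illustrative only: inhibit APICv (activate == false) for one reason. */
	kvm_request_apicv_update(kvm, false, APICV_INHIBIT_REASON_HYPERV);

	/* Callers already holding kvm->arch.apicv_update_lock use the __ variant. */
	__kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_HYPERV);
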
@@ -9395,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        }
 
        if (kvm_request_pending(vcpu)) {
+               if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+                       r = -EIO;
+                       goto out;
+               }
                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
                        if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
@@ -9608,8 +9620,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
-               set_debugreg(vcpu->arch.dr6, 6);
-               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        } else if (unlikely(hw_breakpoint_active())) {
                set_debugreg(0, 7);
        }
@@ -9639,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
                kvm_update_dr0123(vcpu);
                kvm_update_dr7(vcpu);
-               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
        /*
@@ -9976,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                goto out;
        }
 
-       if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+       if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+           (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
                r = -EINVAL;
                goto out;
        }
@@ -10581,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
 
 static int sync_regs(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
-               return -EINVAL;
-
        if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
                __set_regs(vcpu, &vcpu->run->s.regs.regs);
                vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
@@ -10799,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
+       unsigned long new_cr0;
+       u32 eax, dummy;
 
        kvm_lapic_reset(vcpu, init_event);
 
@@ -10865,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
+       /*
+        * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+        * if no CPUID match is found.  Note, it's impossible to get a match at
+        * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+        * i.e. it's impossible for kvm_cpuid() to find a valid entry on RESET.
+        * But, go through the motions in case that's ever remedied.
+        */
+       eax = 1;
+       if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
+               eax = 0x600;
+       kvm_rdx_write(vcpu, eax);
+
        vcpu->arch.ia32_xss = 0;
 
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+       kvm_rip_write(vcpu, 0xfff0);
+
+       /*
+        * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
+        * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+        * (or qualify) that with a footnote stating that CD/NW are preserved.
+        */
+       new_cr0 = X86_CR0_ET;
+       if (init_event)
+               new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+       else
+               new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+       static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+       static_call(kvm_x86_set_cr4)(vcpu, 0);
+       static_call(kvm_x86_set_efer)(vcpu, 0);
+       static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
        /*
         * Reset the MMU context if paging was enabled prior to INIT (which is
         * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
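
For reference, with the architectural CR0 bit values (ET is bit 4, NW bit 29, CD bit 30) the reset path above yields the expected register contents; a quick check of the arithmetic:

	/*
	 * RESET:  new_cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET
	 *                 = 0x20000000 | 0x40000000 | 0x00000010 = 0x60000010
	 * INIT:   new_cr0 = X86_CR0_ET | (old_cr0 & (X86_CR0_NW | X86_CR0_CD)),
	 *                   i.e. ET forced, CD/NW preserved.
	 */
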
@@ -10879,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
         */
        if (old_cr0 & X86_CR0_PG)
                kvm_mmu_reset_context(vcpu);
+
+       /*
+        * Intel's SDM states that all TLB entries are flushed on INIT.  AMD's
+        * APM states the TLBs are untouched by INIT, but it also states that
+        * the TLBs are flushed on "External initialization of the processor."
+        * Flush the guest TLB regardless of vendor, there is no meaningful
+        * benefit in relying on the guest to flush the TLB immediately after
+        * INIT.  A spurious TLB flush is benign and likely negligible from a
+        * performance perspective.
+        */
+       if (init_event)
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 {
@@ -11123,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
+       kvm_xen_init_vm(kvm);
 
        return static_call(kvm_x86_vm_init)(kvm);
 }
@@ -11312,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                int level = i + 1;
-               int lpages = gfn_to_index(slot->base_gfn + npages - 1,
-                                         slot->base_gfn, level) + 1;
+               int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
 
                WARN_ON(slot->arch.rmap[i]);
 
@@ -11396,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
                int lpages;
                int level = i + 1;
 
-               lpages = gfn_to_index(slot->base_gfn + npages - 1,
-                                     slot->base_gfn, level) + 1;
+               lpages = __kvm_mmu_slot_lpages(slot, npages, level);
 
                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
                if (!linfo)
@@ -11481,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
 
 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
-                                    struct kvm_memory_slot *new,
+                                    const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
 {
        bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -11561,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm,
                                kvm_mmu_calculate_default_mmu_pages(kvm));
 
-       /*
-        * FIXME: const-ify all uses of struct kvm_memory_slot.
-        */
-       kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+       kvm_mmu_slot_apply_flags(kvm, old, new, change);
 
        /* Free the arrays associated with the old memslot. */
        if (change == KVM_MR_MOVE)
index 44ae103..7d66d63 100644
@@ -8,6 +8,8 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+void kvm_spurious_fault(void);
+
 static __always_inline void kvm_guest_enter_irqoff(void)
 {
        /*
index ae17250..9ea9c3d 100644
@@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 {
        gpa_t gpa = gfn_to_gpa(gfn);
        int wc_ofs, sec_hi_ofs;
-       int ret;
+       int ret = 0;
        int idx = srcu_read_lock(&kvm->srcu);
 
-       ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
-                                       gpa, PAGE_SIZE);
-       if (ret)
+       if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
+               ret = -EFAULT;
                goto out;
-
-       kvm->arch.xen.shinfo_set = true;
+       }
+       kvm->arch.xen.shinfo_gfn = gfn;
 
        /* Paranoia checks on the 32-bit struct layout */
        BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 
        case KVM_XEN_ATTR_TYPE_SHARED_INFO:
                if (data->u.shared_info.gfn == GPA_INVALID) {
-                       kvm->arch.xen.shinfo_set = false;
+                       kvm->arch.xen.shinfo_gfn = GPA_INVALID;
                        r = 0;
                        break;
                }
@@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
                break;
 
        case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-               if (kvm->arch.xen.shinfo_set)
-                       data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
-               else
-                       data->u.shared_info.gfn = GPA_INVALID;
+               data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn);
                r = 0;
                break;
 
@@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
        return 0;
 }
 
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+       kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+}
+
 void kvm_xen_destroy_vm(struct kvm *kvm)
 {
        if (kvm->arch.xen_hvm_config.msr)
index 463a784..cc0cf5f 100644
@@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
 int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
 int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
@@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
        return 1;
 }
 
+static inline void kvm_xen_init_vm(struct kvm *kvm)
+{
+}
+
 static inline void kvm_xen_destroy_vm(struct kvm *kvm)
 {
 }
index 136b8d9..0d7865a 100644
@@ -2,7 +2,11 @@
 #ifndef __LINUX_ENTRYKVM_H
 #define __LINUX_ENTRYKVM_H
 
-#include <linux/entry-common.h>
+#include <linux/static_call_types.h>
+#include <linux/tracehook.h>
+#include <linux/syscalls.h>
+#include <linux/seccomp.h>
+#include <linux/sched.h>
 #include <linux/tick.h>
 
 /* Transfer to guest mode work */
index ae7735b..041ca7f 100644
@@ -150,6 +150,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MMU_RELOAD        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_UNBLOCK           2
 #define KVM_REQ_UNHALT            3
+#define KVM_REQ_VM_BUGGED         (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -158,6 +159,15 @@ static inline bool is_error_page(struct page *page)
 })
 #define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)
 
+bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+                                struct kvm_vcpu *except,
+                                unsigned long *vcpu_bitmap, cpumask_var_t tmp);
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
+                                     struct kvm_vcpu *except);
+bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
+                               unsigned long *vcpu_bitmap);
+
 #define KVM_USERSPACE_IRQ_SOURCE_ID            0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID       1
 
@@ -344,6 +354,13 @@ struct kvm_vcpu {
        struct kvm_vcpu_stat stat;
        char stats_id[KVM_STATS_NAME_SIZE];
        struct kvm_dirty_ring dirty_ring;
+
+       /*
+        * The index of the most recently used memslot by this vCPU. It's ok
+        * if this becomes stale due to memslot changes since we always check
+        * it is a valid slot.
+        */
+       int last_used_slot;
 };
 
 /* must be called with irqs disabled */
@@ -512,7 +529,7 @@ struct kvm_memslots {
        u64 generation;
        /* The mapping table from slot id to the index in memslots[]. */
        short id_to_index[KVM_MEM_SLOTS_NUM];
-       atomic_t lru_slot;
+       atomic_t last_used_slot;
        int used_slots;
        struct kvm_memory_slot memslots[];
 };
@@ -538,6 +555,11 @@ struct kvm {
        struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
        struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 
+       /* Used to wait for completion of MMU notifiers.  */
+       spinlock_t mn_invalidate_lock;
+       unsigned long mn_active_invalidate_count;
+       struct rcuwait mn_memslots_update_rcuwait;
+
        /*
         * created_vcpus is protected by kvm->lock, and is incremented
         * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
@@ -596,6 +618,7 @@ struct kvm {
        pid_t userspace_pid;
        unsigned int max_halt_poll_ns;
        u32 dirty_ring_size;
+       bool vm_bugged;
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
        struct notifier_block pm_notifier;
@@ -629,6 +652,30 @@ struct kvm {
 #define vcpu_err(vcpu, fmt, ...)                                       \
        kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
+static inline void kvm_vm_bugged(struct kvm *kvm)
+{
+       kvm->vm_bugged = true;
+       kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED);
+}
+
+#define KVM_BUG(cond, kvm, fmt...)                             \
+({                                                             \
+       int __ret = (cond);                                     \
+                                                               \
+       if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt))         \
+               kvm_vm_bugged(kvm);                             \
+       unlikely(__ret);                                        \
+})
+
+#define KVM_BUG_ON(cond, kvm)                                  \
+({                                                             \
+       int __ret = (cond);                                     \
+                                                               \
+       if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))           \
+               kvm_vm_bugged(kvm);                             \
+       unlikely(__ret);                                        \
+})
+
 static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
 {
        return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
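
Combined with the KVM_REQ_VM_BUGGED check added to vcpu_enter_guest() earlier in this diff, these macros let an exit handler mark the VM as broken and fail KVM_RUN with -EIO rather than continue after a bare WARN. A minimal sketch of the usage pattern, mirroring the handle_nmi_window() conversion above (the handler and condition names are placeholders):

	/* Illustrative only; not a handler from this series. */
	static int handle_some_exit(struct kvm_vcpu *vcpu)
	{
		/* "Can't happen" condition: flag the VM as buggy and bail out. */
		if (KVM_BUG_ON(!some_required_state, vcpu->kvm))
			return -EIO;

		/* ...normal exit handling, returning 1 to resume the guest... */
		return 1;
	}
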
@@ -720,6 +767,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
+bool kvm_get_kvm_safe(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 bool file_is_kvm(struct file *file);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
@@ -824,7 +872,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_accessed(kvm_pfn_t pfn);
-void kvm_get_pfn(kvm_pfn_t pfn);
 
 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
@@ -943,14 +990,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-                                struct kvm_vcpu *except,
-                                unsigned long *vcpu_bitmap, cpumask_var_t tmp);
-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
-bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
-                                     struct kvm_vcpu *except);
-bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
-                               unsigned long *vcpu_bitmap);
+void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end);
+void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end);
 
 long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
@@ -1034,6 +1077,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
+int kvm_arch_create_vm_debugfs(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
@@ -1157,29 +1201,49 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
 /*
- * search_memslots() and __gfn_to_memslot() are here because they are
- * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
- * gfn_to_memslot() itself isn't here as an inline because that would
- * bloat other code too much.
+ * Returns a pointer to the memslot at slot_index if it contains gfn.
+ * Otherwise returns NULL.
+ */
+static inline struct kvm_memory_slot *
+try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn)
+{
+       struct kvm_memory_slot *slot;
+
+       if (slot_index < 0 || slot_index >= slots->used_slots)
+               return NULL;
+
+       /*
+        * slot_index can come from vcpu->last_used_slot which is not kept
+        * in sync with userspace-controllable memslot deletion. So use nospec
+        * to prevent the CPU from speculating past the end of memslots[].
+        */
+       slot_index = array_index_nospec(slot_index, slots->used_slots);
+       slot = &slots->memslots[slot_index];
+
+       if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages)
+               return slot;
+       else
+               return NULL;
+}
+
+/*
+ * Returns a pointer to the memslot that contains gfn and records the index of
+ * the slot in index. Otherwise returns NULL.
  *
  * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
  */
 static inline struct kvm_memory_slot *
-search_memslots(struct kvm_memslots *slots, gfn_t gfn)
+search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index)
 {
        int start = 0, end = slots->used_slots;
-       int slot = atomic_read(&slots->lru_slot);
        struct kvm_memory_slot *memslots = slots->memslots;
+       struct kvm_memory_slot *slot;
 
        if (unlikely(!slots->used_slots))
                return NULL;
 
-       if (gfn >= memslots[slot].base_gfn &&
-           gfn < memslots[slot].base_gfn + memslots[slot].npages)
-               return &memslots[slot];
-
        while (start < end) {
-               slot = start + (end - start) / 2;
+               int slot = start + (end - start) / 2;
 
                if (gfn >= memslots[slot].base_gfn)
                        end = slot;
@@ -1187,19 +1251,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn)
                        start = slot + 1;
        }
 
-       if (start < slots->used_slots && gfn >= memslots[start].base_gfn &&
-           gfn < memslots[start].base_gfn + memslots[start].npages) {
-               atomic_set(&slots->lru_slot, start);
-               return &memslots[start];
+       slot = try_get_memslot(slots, start, gfn);
+       if (slot) {
+               *index = start;
+               return slot;
        }
 
        return NULL;
 }
 
+/*
+ * __gfn_to_memslot() and its descendants are here because it is called from
+ * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot()
+ * itself isn't here as an inline because that would bloat other code too much.
+ */
 static inline struct kvm_memory_slot *
 __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 {
-       return search_memslots(slots, gfn);
+       struct kvm_memory_slot *slot;
+       int slot_index = atomic_read(&slots->last_used_slot);
+
+       slot = try_get_memslot(slots, slot_index, gfn);
+       if (slot)
+               return slot;
+
+       slot = search_memslots(slots, gfn, &slot_index);
+       if (slot) {
+               atomic_set(&slots->last_used_slot, slot_index);
+               return slot;
+       }
+
+       return NULL;
 }
 
 static inline unsigned long
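
The generic gfn_to_memslot() paths are expected to combine these helpers with the new vcpu->last_used_slot cache roughly as follows; the function below is an illustrative sketch, not a quote of the kvm_main.c implementation:

	/* Sketch: resolve a gfn for a vCPU, trying the cached slot index first. */
	static struct kvm_memory_slot *
	vcpu_gfn_to_memslot_sketch(struct kvm_vcpu *vcpu, gfn_t gfn)
	{
		struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
		struct kvm_memory_slot *slot;
		int slot_index;

		/* Fast path: the slot this vCPU hit last time. */
		slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
		if (slot)
			return slot;

		/* Slow path: binary search, then remember the winning index. */
		slot = search_memslots(slots, gfn, &slot_index);
		if (slot)
			vcpu->last_used_slot = slot_index;

		return slot;
	}
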
@@ -1273,56 +1355,66 @@ struct _kvm_stats_desc {
        char name[KVM_STATS_NAME_SIZE];
 };
 
-#define STATS_DESC_COMMON(type, unit, base, exp)                              \
+#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz)                     \
        .flags = type | unit | base |                                          \
                 BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) |              \
                 BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) |              \
                 BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK),               \
        .exponent = exp,                                                       \
-       .size = 1
+       .size = sz,                                                            \
+       .bucket_size = bsz
 
-#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp)                    \
+#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz)           \
        {                                                                      \
                {                                                              \
-                       STATS_DESC_COMMON(type, unit, base, exp),              \
+                       STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vm_stat, generic.stat)   \
                },                                                             \
                .name = #stat,                                                 \
        }
-#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp)                  \
+#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz)         \
        {                                                                      \
                {                                                              \
-                       STATS_DESC_COMMON(type, unit, base, exp),              \
+                       STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
                },                                                             \
                .name = #stat,                                                 \
        }
-#define VM_STATS_DESC(stat, type, unit, base, exp)                            \
+#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz)                   \
        {                                                                      \
                {                                                              \
-                       STATS_DESC_COMMON(type, unit, base, exp),              \
+                       STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vm_stat, stat)           \
                },                                                             \
                .name = #stat,                                                 \
        }
-#define VCPU_STATS_DESC(stat, type, unit, base, exp)                          \
+#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz)                 \
        {                                                                      \
                {                                                              \
-                       STATS_DESC_COMMON(type, unit, base, exp),              \
+                       STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vcpu_stat, stat)         \
                },                                                             \
                .name = #stat,                                                 \
        }
 /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */
-#define STATS_DESC(SCOPE, stat, type, unit, base, exp)                        \
-       SCOPE##_STATS_DESC(stat, type, unit, base, exp)
+#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz)                       \
+       SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz)
 
 #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent)              \
-       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent)
+       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE,                     \
+               unit, base, exponent, 1, 0)
 #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)                 \
-       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent)
+       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT,                        \
+               unit, base, exponent, 1, 0)
 #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent)                    \
-       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent)
+       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK,                           \
+               unit, base, exponent, 1, 0)
+#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz)     \
+       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST,                    \
+               unit, base, exponent, sz, bsz)
+#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz)            \
+       STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST,                       \
+               unit, base, exponent, sz, 0)
 
 /* Cumulative counter, read/write */
 #define STATS_DESC_COUNTER(SCOPE, name)                                               \
@@ -1341,9 +1433,18 @@ struct _kvm_stats_desc {
 #define STATS_DESC_TIME_NSEC(SCOPE, name)                                     \
        STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
                KVM_STATS_BASE_POW10, -9)
+/* Linear histogram for time in nanosecond */
+#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz)                    \
+       STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS,            \
+               KVM_STATS_BASE_POW10, -9, sz, bsz)
+/* Logarithmic histogram for time in nanosecond */
+#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz)                         \
+       STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS,               \
+               KVM_STATS_BASE_POW10, -9, sz)
 
 #define KVM_GENERIC_VM_STATS()                                                \
-       STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush)
+       STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush),                      \
+       STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests)
 
 #define KVM_GENERIC_VCPU_STATS()                                              \
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll),                \
@@ -1351,13 +1452,62 @@ struct _kvm_stats_desc {
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid),                   \
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup),                         \
        STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns),              \
-       STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns)
+       STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns),                 \
+       STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns),                      \
+       STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist,     \
+                       HALT_POLL_HIST_COUNT),                                 \
+       STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist,        \
+                       HALT_POLL_HIST_COUNT),                                 \
+       STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist,             \
+                       HALT_POLL_HIST_COUNT)
 
 extern struct dentry *kvm_debugfs_dir;
+
 ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
                       const struct _kvm_stats_desc *desc,
                       void *stats, size_t size_stats,
                       char __user *user_buffer, size_t size, loff_t *offset);
+
+/**
+ * kvm_stats_linear_hist_update() - Update bucket value for linear histogram
+ * statistics data.
+ *
+ * @data: start address of the stats data
+ * @size: the number of buckets in the stats data
+ * @value: the new value used to update the linear histogram's bucket
+ * @bucket_size: the size (width) of a bucket
+ */
+static inline void kvm_stats_linear_hist_update(u64 *data, size_t size,
+                                               u64 value, size_t bucket_size)
+{
+       size_t index = div64_u64(value, bucket_size);
+
+       index = min(index, size - 1);
+       ++data[index];
+}
+
+/**
+ * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram
+ * statistics data.
+ *
+ * @data: start address of the stats data
+ * @size: the number of buckets in the stats data
+ * @value: the new value used to update the logarithmic histogram's bucket
+ */
+static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value)
+{
+       size_t index = fls64(value);
+
+       index = min(index, size - 1);
+       ++data[index];
+}
+
+#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize)                     \
+       kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize)
+#define KVM_STATS_LOG_HIST_UPDATE(array, value)                                       \
+       kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value)
+
+
 extern const struct kvm_stats_header kvm_vm_stats_header;
 extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
 extern const struct kvm_stats_header kvm_vcpu_stats_header;
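
As a quick worked example of the logarithmic variant: fls64() returns the 1-based position of the most significant set bit, so a 1500 ns value (2^10 <= 1500 < 2^11) lands in bucket 11, and anything beyond the last bucket is clamped to size - 1. With the halt-polling histograms added elsewhere in this diff, a caller would record it as:

	/* Illustrative only: record a 1500 ns successful poll for this vCPU. */
	KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_poll_success_hist, 1500);
	/* expands to kvm_stats_log_hist_update(hist, 32, 1500) -> ++hist[11] */
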
index ed6a985..2237abb 100644
@@ -76,8 +76,11 @@ struct kvm_mmu_memory_cache {
 };
 #endif
 
+#define HALT_POLL_HIST_COUNT                   32
+
 struct kvm_vm_stat_generic {
        u64 remote_tlb_flush;
+       u64 remote_tlb_flush_requests;
 };
 
 struct kvm_vcpu_stat_generic {
@@ -87,6 +90,10 @@ struct kvm_vcpu_stat_generic {
        u64 halt_wakeup;
        u64 halt_poll_success_ns;
        u64 halt_poll_fail_ns;
+       u64 halt_wait_ns;
+       u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT];
+       u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT];
+       u64 halt_wait_hist[HALT_POLL_HIST_COUNT];
 };
 
 #define KVM_STATS_NAME_SIZE    48
index 5922031..1ace27c 100644
@@ -632,43 +632,6 @@ static inline int PageTransCompound(struct page *page)
        return PageCompound(page);
 }
 
-/*
- * PageTransCompoundMap is the same as PageTransCompound, but it also
- * guarantees the primary MMU has the entire compound page mapped
- * through pmd_trans_huge, which in turn guarantees the secondary MMUs
- * can also map the entire compound page. This allows the secondary
- * MMUs to call get_user_pages() only once for each compound page and
- * to immediately map the entire compound page with a single secondary
- * MMU fault. If there will be a pmd split later, the secondary MMUs
- * will get an update through the MMU notifier invalidation through
- * split_huge_pmd().
- *
- * Unlike PageTransCompound, this is safe to be called only while
- * split_huge_pmd() cannot run from under us, like if protected by the
- * MMU notifier, otherwise it may result in page->_mapcount check false
- * positives.
- *
- * We have to treat page cache THP differently since every subpage of it
- * would get _mapcount inc'ed once it is PMD mapped.  But, it may be PTE
- * mapped in the current process so comparing subpage's _mapcount to
- * compound_mapcount to filter out PTE mapped case.
- */
-static inline int PageTransCompoundMap(struct page *page)
-{
-       struct page *head;
-
-       if (!PageTransCompound(page))
-               return 0;
-
-       if (PageAnon(page))
-               return atomic_read(&page->_mapcount) < 0;
-
-       head = compound_head(page);
-       /* File THP is PMD mapped and not PTE mapped */
-       return atomic_read(&page->_mapcount) ==
-              atomic_read(compound_mapcount_ptr(head));
-}
-
 /*
  * PageTransTail returns true for both transparent huge pages
  * and hugetlbfs pages, so it should only be called when it's known
index d9e4aab..a067410 100644
@@ -1965,7 +1965,9 @@ struct kvm_stats_header {
 #define KVM_STATS_TYPE_CUMULATIVE      (0x0 << KVM_STATS_TYPE_SHIFT)
 #define KVM_STATS_TYPE_INSTANT         (0x1 << KVM_STATS_TYPE_SHIFT)
 #define KVM_STATS_TYPE_PEAK            (0x2 << KVM_STATS_TYPE_SHIFT)
-#define KVM_STATS_TYPE_MAX             KVM_STATS_TYPE_PEAK
+#define KVM_STATS_TYPE_LINEAR_HIST     (0x3 << KVM_STATS_TYPE_SHIFT)
+#define KVM_STATS_TYPE_LOG_HIST                (0x4 << KVM_STATS_TYPE_SHIFT)
+#define KVM_STATS_TYPE_MAX             KVM_STATS_TYPE_LOG_HIST
 
 #define KVM_STATS_UNIT_SHIFT           4
 #define KVM_STATS_UNIT_MASK            (0xF << KVM_STATS_UNIT_SHIFT)
@@ -1988,8 +1990,9 @@ struct kvm_stats_header {
  * @size: The number of data items for this stats.
  *        Every data item is of type __u64.
  * @offset: The offset of the stats to the start of stat structure in
- *          struture kvm or kvm_vcpu.
- * @unused: Unused field for future usage. Always 0 for now.
+ *          structure kvm or kvm_vcpu.
+ * @bucket_size: A parameter value used for histogram stats. It is only used
+ *             for linear histogram stats, specifying the size of each bucket.
  * @name: The name string for the stats. Its size is indicated by the
  *        &kvm_stats_header->name_size.
  */
@@ -1998,7 +2001,7 @@ struct kvm_stats_desc {
        __s16 exponent;
        __u16 size;
        __u32 offset;
-       __u32 unused;
+       __u32 bucket_size;
        char name[];
 };
 
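
Read back through the binary stats file descriptor, a linear histogram with .size = N and .bucket_size = B decodes as N counters where bucket i covers [i*B, (i+1)*B) and the last bucket absorbs everything larger, matching kvm_stats_linear_hist_update() above. A small userspace sketch (the descriptor and data are assumed to have already been read from a KVM_GET_STATS_FD file):

	#include <stdio.h>
	#include <linux/kvm.h>

	/* Illustrative decode of one linear-histogram stat. */
	static void print_linear_hist(const struct kvm_stats_desc *desc,
				      const __u64 *data)
	{
		__u32 i;

		for (i = 0; i < desc->size; i++)
			printf("[%u, %u): %llu\n",
			       i * desc->bucket_size,
			       (i + 1) * desc->bucket_size,
			       (unsigned long long)data[i]);
	}
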
index 0709af0..98053d3 100644
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /aarch64/debug-exceptions
 /aarch64/get-reg-list
+/aarch64/psci_cpu_on_test
 /aarch64/vgic_init
 /s390x/memop
 /s390x/resets
index 5832f51..5d05801 100644
@@ -86,6 +86,7 @@ TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
+TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
diff --git a/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c
new file mode 100644
index 0000000..018c269
--- /dev/null
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the
+ * CPU_ON PSCI call matches what the caller requested.
+ *
+ * Copyright (c) 2021 Google LLC.
+ *
+ * This is a regression test for a race between KVM servicing the PSCI call and
+ * userspace reading the vCPUs registers.
+ */
+
+#define _GNU_SOURCE
+
+#include <linux/psci.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID_SOURCE 0
+#define VCPU_ID_TARGET 1
+
+#define CPU_ON_ENTRY_ADDR 0xfeedf00dul
+#define CPU_ON_CONTEXT_ID 0xdeadc0deul
+
+static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr,
+                           uint64_t context_id)
+{
+       register uint64_t x0 asm("x0") = PSCI_0_2_FN64_CPU_ON;
+       register uint64_t x1 asm("x1") = target_cpu;
+       register uint64_t x2 asm("x2") = entry_addr;
+       register uint64_t x3 asm("x3") = context_id;
+
+       asm("hvc #0"
+           : "=r"(x0)
+           : "r"(x0), "r"(x1), "r"(x2), "r"(x3)
+           : "memory");
+
+       return x0;
+}
+
+static uint64_t psci_affinity_info(uint64_t target_affinity,
+                                  uint64_t lowest_affinity_level)
+{
+       register uint64_t x0 asm("x0") = PSCI_0_2_FN64_AFFINITY_INFO;
+       register uint64_t x1 asm("x1") = target_affinity;
+       register uint64_t x2 asm("x2") = lowest_affinity_level;
+
+       asm("hvc #0"
+           : "=r"(x0)
+           : "r"(x0), "r"(x1), "r"(x2)
+           : "memory");
+
+       return x0;
+}
+
+static void guest_main(uint64_t target_cpu)
+{
+       GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID));
+       uint64_t target_state;
+
+       do {
+               target_state = psci_affinity_info(target_cpu, 0);
+
+               GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) ||
+                            (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF));
+       } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON);
+
+       GUEST_DONE();
+}
+
+int main(void)
+{
+       uint64_t target_mpidr, obs_pc, obs_x0;
+       struct kvm_vcpu_init init;
+       struct kvm_vm *vm;
+       struct ucall uc;
+
+       vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+       kvm_vm_elf_load(vm, program_invocation_name);
+       ucall_init(vm, NULL);
+
+       vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init);
+       init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2);
+
+       aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_main);
+
+       /*
+        * Make sure the target vCPU is powered off when executing the test.
+        */
+       init.features[0] |= (1 << KVM_ARM_VCPU_POWER_OFF);
+       aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_main);
+
+       get_reg(vm, VCPU_ID_TARGET, ARM64_SYS_REG(MPIDR_EL1), &target_mpidr);
+       vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK);
+       vcpu_run(vm, VCPU_ID_SOURCE);
+
+       switch (get_ucall(vm, VCPU_ID_SOURCE, &uc)) {
+       case UCALL_DONE:
+               break;
+       case UCALL_ABORT:
+               TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__,
+                         uc.args[1]);
+               break;
+       default:
+               TEST_FAIL("Unhandled ucall: %lu", uc.cmd);
+       }
+
+       get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.pc), &obs_pc);
+       get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.regs[0]), &obs_x0);
+
+       TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR,
+                   "unexpected target cpu pc: %lx (expected: %lx)",
+                   obs_pc, CPU_ON_ENTRY_ADDR);
+       TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID,
+                   "unexpected target context id: %lx (expected: %lx)",
+                   obs_x0, CPU_ON_CONTEXT_ID);
+
+       kvm_vm_free(vm);
+       return 0;
+}
index e2baa18..71e277c 100644
@@ -222,8 +222,6 @@ static void *vcpu_thread_main(void *arg)
        int vcpu_id = vcpu_args->vcpu_id;
        int current_iteration = -1;
 
-       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
-
        while (spin_wait_for_next_iteration(&current_iteration)) {
                switch (READ_ONCE(iteration_work)) {
                case ITERATION_ACCESS_MEMORY:
@@ -333,7 +331,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        pthread_t *vcpu_threads;
        int vcpus = params->vcpus;
 
-       vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes,
+       vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1,
                                 params->backing_src);
 
        perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes,
index b747043..e79c1b6 100644
@@ -52,7 +52,6 @@ static void *vcpu_worker(void *data)
        struct timespec start;
        struct timespec ts_diff;
 
-       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
        run = vcpu_state(vm, vcpu_id);
 
        clock_gettime(CLOCK_MONOTONIC, &start);
@@ -293,7 +292,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        int vcpu_id;
        int r;
 
-       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
                                 p->src_type);
 
        perf_test_args.wr_fract = 1;
index 80cbd3a..3c30d00 100644 (file)
@@ -44,7 +44,6 @@ static void *vcpu_worker(void *data)
        struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data;
        int vcpu_id = vcpu_args->vcpu_id;
 
-       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
        run = vcpu_state(vm, vcpu_id);
 
        while (!READ_ONCE(host_quit)) {
@@ -94,8 +93,59 @@ struct test_params {
        int wr_fract;
        bool partition_vcpu_memory_access;
        enum vm_mem_backing_src_type backing_src;
+       int slots;
 };
 
+static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
+{
+       int i;
+
+       for (i = 0; i < slots; i++) {
+               int slot = PERF_TEST_MEM_SLOT_INDEX + i;
+               int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0;
+
+               vm_mem_region_set_flags(vm, slot, flags);
+       }
+}
+
+static inline void enable_dirty_logging(struct kvm_vm *vm, int slots)
+{
+       toggle_dirty_logging(vm, slots, true);
+}
+
+static inline void disable_dirty_logging(struct kvm_vm *vm, int slots)
+{
+       toggle_dirty_logging(vm, slots, false);
+}
+
+static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap,
+                         uint64_t nr_pages)
+{
+       uint64_t slot_pages = nr_pages / slots;
+       int i;
+
+       for (i = 0; i < slots; i++) {
+               int slot = PERF_TEST_MEM_SLOT_INDEX + i;
+               unsigned long *slot_bitmap = bitmap + i * slot_pages;
+
+               kvm_vm_get_dirty_log(vm, slot, slot_bitmap);
+       }
+}
+
+static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap,
+                           uint64_t nr_pages)
+{
+       uint64_t slot_pages = nr_pages / slots;
+       int i;
+
+       for (i = 0; i < slots; i++) {
+               int slot = PERF_TEST_MEM_SLOT_INDEX + i;
+               unsigned long *slot_bitmap = bitmap + i * slot_pages;
+
+               kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages);
+       }
+}
+
 static void run_test(enum vm_guest_mode mode, void *arg)
 {
        struct test_params *p = arg;
@@ -114,7 +164,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        struct timespec clear_dirty_log_total = (struct timespec){0};
 
        vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
-                                p->backing_src);
+                                p->slots, p->backing_src);
 
        perf_test_args.wr_fract = p->wr_fract;
 
@@ -163,8 +213,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
        /* Enable dirty logging */
        clock_gettime(CLOCK_MONOTONIC, &start);
-       vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
-                               KVM_MEM_LOG_DIRTY_PAGES);
+       enable_dirty_logging(vm, p->slots);
        ts_diff = timespec_elapsed(start);
        pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
                ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -190,8 +239,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                        iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
                clock_gettime(CLOCK_MONOTONIC, &start);
-               kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
-
+               get_dirty_log(vm, p->slots, bmap, host_num_pages);
                ts_diff = timespec_elapsed(start);
                get_dirty_log_total = timespec_add(get_dirty_log_total,
                                                   ts_diff);
@@ -200,9 +248,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
                if (dirty_log_manual_caps) {
                        clock_gettime(CLOCK_MONOTONIC, &start);
-                       kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0,
-                                              host_num_pages);
-
+                       clear_dirty_log(vm, p->slots, bmap, host_num_pages);
                        ts_diff = timespec_elapsed(start);
                        clear_dirty_log_total = timespec_add(clear_dirty_log_total,
                                                             ts_diff);
@@ -213,7 +259,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
        /* Disable dirty logging */
        clock_gettime(CLOCK_MONOTONIC, &start);
-       vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0);
+       disable_dirty_logging(vm, p->slots);
        ts_diff = timespec_elapsed(start);
        pr_info("Disabling dirty logging time: %ld.%.9lds\n",
                ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -244,7 +290,8 @@ static void help(char *name)
 {
        puts("");
        printf("usage: %s [-h] [-i iterations] [-p offset] "
-              "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name);
+              "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type] "
+              "[-x memslots]\n", name);
        puts("");
        printf(" -i: specify iteration counts (default: %"PRIu64")\n",
               TEST_HOST_LOOP_N);
@@ -263,6 +310,8 @@ static void help(char *name)
               "     them into a separate region of memory for each vCPU.\n");
        printf(" -s: specify the type of memory that should be used to\n"
               "     back the guest data region.\n\n");
+       printf(" -x: split the memory region into this number of memslots.\n"
+              "     (default: 1)\n");
        backing_src_help();
        puts("");
        exit(0);
@@ -276,6 +325,7 @@ int main(int argc, char *argv[])
                .wr_fract = 1,
                .partition_vcpu_memory_access = true,
                .backing_src = VM_MEM_SRC_ANONYMOUS,
+               .slots = 1,
        };
        int opt;
 
@@ -286,7 +336,7 @@ int main(int argc, char *argv[])
 
        guest_modes_append_default();
 
-       while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) {
+       while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) {
                switch (opt) {
                case 'i':
                        p.iterations = atoi(optarg);
@@ -316,6 +366,9 @@ int main(int argc, char *argv[])
                case 's':
                        p.backing_src = parse_backing_src_type(optarg);
                        break;
+               case 'x':
+                       p.slots = atoi(optarg);
+                       break;
                case 'h':
                default:
                        help(argv[0]);
index 27dc5c2..c0273ae 100644 (file)
@@ -17,6 +17,7 @@
 #define CPACR_EL1               3, 0,  1, 0, 2
 #define TCR_EL1                 3, 0,  2, 0, 2
 #define MAIR_EL1                3, 0, 10, 2, 0
+#define MPIDR_EL1               3, 0,  0, 0, 5
 #define TTBR0_EL1               3, 0,  2, 0, 0
 #define SCTLR_EL1               3, 0,  1, 0, 0
 #define VBAR_EL1                3, 0, 12, 0, 0
@@ -40,6 +41,8 @@
                          (0xfful << (4 * 8)) | \
                          (0xbbul << (5 * 8)))
 
+#define MPIDR_HWID_BITMASK (0xff00fffffful)
+
 static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr)
 {
        struct kvm_one_reg reg;
index 005f214..df9f1a3 100644 (file)
@@ -44,7 +44,7 @@ extern struct perf_test_args perf_test_args;
 extern uint64_t guest_test_phys_mem;
 
 struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
-                                  uint64_t vcpu_memory_bytes,
+                                  uint64_t vcpu_memory_bytes, int slots,
                                   enum vm_mem_backing_src_type backing_src);
 void perf_test_destroy_vm(struct kvm_vm *vm);
 void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
index 5906bbc..17f65d5 100644 (file)
@@ -109,6 +109,18 @@ static void stats_test(int stats_fd)
                /* Check size field, which should not be zero */
                TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0",
                                pdesc->name);
+               /* Check bucket_size field */
+               switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
+               case KVM_STATS_TYPE_LINEAR_HIST:
+                       TEST_ASSERT(pdesc->bucket_size,
+                           "Bucket size of Linear Histogram stats (%s) is zero",
+                           pdesc->name);
+                       break;
+               default:
+                       TEST_ASSERT(!pdesc->bucket_size,
+                           "Bucket size of stats (%s) is not zero",
+                           pdesc->name);
+               }
                size_data += pdesc->size * sizeof(*stats_data);
        }
        /* Check overlap */
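
The new check encodes the API contract that bucket_size is only meaningful for
histogram statistics: for a linear histogram every bucket covers a fixed range
of bucket_size values, while all other stat types must leave the field zero.
A hypothetical reader-side helper (not part of the selftest) that maps a sample
to a bucket under that convention:

    #include <stdint.h>

    /*
     * Hypothetical helper, not taken from this patch: pick the bucket for a
     * sample in a linear histogram of 'buckets' buckets, each covering
     * 'bucket_size' values; out-of-range samples land in the last bucket.
     */
    static inline uint64_t linear_hist_index(uint64_t value,
                                             uint64_t bucket_size,
                                             uint64_t buckets)
    {
            uint64_t index = value / bucket_size;

            return index < buckets ? index : buckets - 1;
    }
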
index b488f4a..0ef80db 100644 (file)
@@ -50,11 +50,12 @@ static void guest_code(uint32_t vcpu_id)
 }
 
 struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
-                                  uint64_t vcpu_memory_bytes,
+                                  uint64_t vcpu_memory_bytes, int slots,
                                   enum vm_mem_backing_src_type backing_src)
 {
        struct kvm_vm *vm;
        uint64_t guest_num_pages;
+       int i;
 
        pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
 
@@ -68,6 +69,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
                    "Guest memory size is not host page size aligned.");
        TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
                    "Guest memory size is not guest page size aligned.");
+       TEST_ASSERT(guest_num_pages % slots == 0,
+                   "Guest memory cannot be evenly divided into %d slots.",
+                   slots);
 
        vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
                                  (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size,
@@ -95,10 +99,16 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
 #endif
        pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
 
-       /* Add an extra memory slot for testing */
-       vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem,
-                                   PERF_TEST_MEM_SLOT_INDEX,
-                                   guest_num_pages, 0);
+       /* Add extra memory slots for testing */
+       for (i = 0; i < slots; i++) {
+               uint64_t region_pages = guest_num_pages / slots;
+               vm_paddr_t region_start = guest_test_phys_mem +
+                       region_pages * perf_test_args.guest_page_size * i;
+
+               vm_userspace_mem_region_add(vm, backing_src, region_start,
+                                           PERF_TEST_MEM_SLOT_INDEX + i,
+                                           region_pages, 0);
+       }
 
        /* Do mapping for the demand paging memory slot */
        virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages);
@@ -140,6 +150,8 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
                        vcpu_gpa = guest_test_phys_mem;
                }
 
+               vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+
                pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
                         vcpu_id, vcpu_gpa, vcpu_gpa +
                         (vcpu_args->pages * perf_test_args.guest_page_size));
index 98351ba..4cfcafe 100644 (file)
@@ -45,7 +45,6 @@ static void *vcpu_worker(void *data)
        struct kvm_vm *vm = perf_test_args.vm;
        struct kvm_run *run;
 
-       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
        run = vcpu_state(vm, vcpu_id);
 
        /* Let the guest access its memory until a stop signal is received */
@@ -105,7 +104,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        struct kvm_vm *vm;
        int vcpu_id;
 
-       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
                                 VM_MEM_SRC_ANONYMOUS);
 
        perf_test_args.wr_fract = 1;
index 6097a82..5f078db 100644 (file)
@@ -8,12 +8,15 @@
 #include <string.h>
 #include "kvm_util.h"
 #include "processor.h"
+#include "apic.h"
 
 #define VCPU_ID 0
 
 #define DR6_BD         (1 << 13)
 #define DR7_GD         (1 << 13)
 
+#define IRQ_VECTOR 0xAA
+
 /* For testing data access debug BP */
 uint32_t guest_value;
 
@@ -21,6 +24,11 @@ extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start;
 
 static void guest_code(void)
 {
+       /* Create a pending interrupt on current vCPU */
+       x2apic_enable();
+       x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT |
+                        APIC_DM_FIXED | IRQ_VECTOR);
+
        /*
         * Software BP tests.
         *
@@ -38,12 +46,19 @@ static void guest_code(void)
                     "mov %%rax,%0;\n\t write_data:"
                     : "=m" (guest_value) : : "rax");
 
-       /* Single step test, covers 2 basic instructions and 2 emulated */
+       /*
+        * Single step test, covers 4 basic instructions and 2 emulated
+        *
+        * Enable interrupts during the single stepping to verify that the
+        * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ.
+        */
        asm volatile("ss_start: "
+                    "sti\n\t"
                     "xor %%eax,%%eax\n\t"
                     "cpuid\n\t"
                     "movl $0x1a0,%%ecx\n\t"
                     "rdmsr\n\t"
+                    "cli\n\t"
                     : : : "eax", "ebx", "ecx", "edx");
 
        /* DR6.BD test */
@@ -72,11 +87,13 @@ int main(void)
        uint64_t cmd;
        int i;
        /* Instruction lengths starting at ss_start */
-       int ss_size[4] = {
+       int ss_size[6] = {
+               1,              /* sti */
                2,              /* xor */
                2,              /* cpuid */
                5,              /* mov */
                2,              /* rdmsr */
+               1,              /* cli */
        };
 
        if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) {
@@ -154,7 +171,8 @@ int main(void)
        for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) {
                target_rip += ss_size[i];
                CLEAR_DEBUG();
-               debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+               debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |
+                               KVM_GUESTDBG_BLOCKIRQ;
                debug.arch.debugreg[7] = 0x00000400;
                APPLY_DEBUG();
                vcpu_run(vm, VCPU_ID);
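
APPLY_DEBUG() ultimately issues a KVM_SET_GUEST_DEBUG ioctl on the vCPU file
descriptor.  Outside the selftest wrappers, requesting single-stepping with
interrupt injection blocked reduces to something like the sketch below (error
handling trimmed; vcpu_fd is assumed to come from KVM_CREATE_VCPU):

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Single-step the vCPU while keeping interrupt injection blocked. */
    static int enable_blockirq_single_step(int vcpu_fd)
    {
            struct kvm_guest_debug dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |
                          KVM_GUESTDBG_BLOCKIRQ;

            /* Returns 0 on success, -1 with errno set on failure. */
            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }
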
index e609d42..eefca6c 100644 (file)
@@ -136,9 +136,7 @@ ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
                src = stats + pos - header->data_offset;
                if (copy_to_user(dest, src, copylen))
                        return -EFAULT;
-               remain -= copylen;
                pos += copylen;
-               dest += copylen;
        }
 
        *offset = pos;
index 7aafefc..88f4683 100644 (file)
@@ -91,11 +91,6 @@ static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
        gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
 }
 
-static inline bool kvm_dirty_gfn_invalid(struct kvm_dirty_gfn *gfn)
-{
-       return gfn->flags == 0;
-}
-
 static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
 {
        return gfn->flags & KVM_DIRTY_GFN_F_RESET;
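
kvm_dirty_gfn_invalid() had no callers left; the ring code only needs the
"dirtied" and "harvested" states.  For reference, the userspace half of the
protocol these flags implement looks roughly like the sketch below (a
simplification of the documented interface, not code from this patch;
ring-full handling and memory ordering are omitted):

    #include <linux/kvm.h>
    #include <stdint.h>
    #include <sys/ioctl.h>

    /*
     * 'ring' is the mmap()ed per-vCPU dirty ring of 'size' entries and
     * '*next' the first slot not yet collected.  KVM publishes entries with
     * KVM_DIRTY_GFN_F_DIRTY set; userspace records them, marks them with
     * KVM_DIRTY_GFN_F_RESET and then asks KVM to reset the rings.
     */
    static uint32_t collect_dirty_ring(int vm_fd, struct kvm_dirty_gfn *ring,
                                       uint32_t size, uint32_t *next)
    {
            uint32_t collected = 0;

            while (ring[*next % size].flags & KVM_DIRTY_GFN_F_DIRTY) {
                    struct kvm_dirty_gfn *gfn = &ring[*next % size];

                    /* gfn->slot and gfn->offset identify the dirty page. */
                    gfn->flags = KVM_DIRTY_GFN_F_RESET;
                    (*next)++;
                    collected++;
            }

            if (collected)
                    ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);

            return collected;
    }
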
index b50dbe2..439d3b4 100644 (file)
@@ -189,16 +189,6 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
        return true;
 }
 
-bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
-{
-       struct page *page = pfn_to_page(pfn);
-
-       if (!PageTransCompoundMap(page))
-               return false;
-
-       return is_transparent_hugepage(compound_head(page));
-}
-
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -318,6 +308,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
         */
        long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
 
+       ++kvm->stat.generic.remote_tlb_flush_requests;
        /*
         * We want to publish modifications to the page tables before reading
         * mode. Pairs with a memory barrier in arch-specific code.
@@ -415,6 +406,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->preempted = false;
        vcpu->ready = false;
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+       vcpu->last_used_slot = 0;
 }
 
 void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -496,17 +488,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       /* The on_lock() path does not yet support lock elision. */
-       if (!IS_KVM_NULL_FN(range->on_lock)) {
-               locked = true;
-               KVM_MMU_LOCK(kvm);
-
-               range->on_lock(kvm, range->start, range->end);
-
-               if (IS_KVM_NULL_FN(range->handler))
-                       goto out_unlock;
-       }
-
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(slot, slots) {
@@ -538,6 +519,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                        if (!locked) {
                                locked = true;
                                KVM_MMU_LOCK(kvm);
+                               if (!IS_KVM_NULL_FN(range->on_lock))
+                                       range->on_lock(kvm, range->start, range->end);
+                               if (IS_KVM_NULL_FN(range->handler))
+                                       break;
                        }
                        ret |= range->handler(kvm, &gfn_range);
                }
@@ -546,7 +531,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
        if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);
 
-out_unlock:
        if (locked)
                KVM_MMU_UNLOCK(kvm);
 
@@ -604,16 +588,20 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
        trace_kvm_set_spte_hva(address);
 
        /*
-        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
-        * and so always runs with an elevated notifier count.  This obviates
-        * the need to bump the sequence count.
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}().
+        * If mmu_notifier_count is zero, then no in-progress invalidations,
+        * including this one, found a relevant memslot at start(); rechecking
+        * memslots here is unnecessary.  Note, a false positive (count elevated
+        * by a different invalidation) is sub-optimal but functionally ok.
         */
-       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+       WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+       if (!READ_ONCE(kvm->mmu_notifier_count))
+               return;
 
        kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
                                   unsigned long end)
 {
        /*
@@ -658,12 +646,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 
        trace_kvm_unmap_hva_range(range->start, range->end);
 
+       /*
+        * Prevent memslot modification between range_start() and range_end()
+        * so that the conditional locking provides the same result in both
+        * functions.  Without that guarantee, the mmu_notifier_count
+        * adjustments would be imbalanced.
+        *
+        * Pairs with the decrement in range_end().
+        */
+       spin_lock(&kvm->mn_invalidate_lock);
+       kvm->mn_active_invalidate_count++;
+       spin_unlock(&kvm->mn_invalidate_lock);
+
        __kvm_handle_hva_range(kvm, &hva_range);
 
        return 0;
 }
 
-static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
                                   unsigned long end)
 {
        /*
@@ -694,9 +694,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
                .flush_on_ret   = false,
                .may_block      = mmu_notifier_range_blockable(range),
        };
+       bool wake;
 
        __kvm_handle_hva_range(kvm, &hva_range);
 
+       /* Pairs with the increment in range_start(). */
+       spin_lock(&kvm->mn_invalidate_lock);
+       wake = (--kvm->mn_active_invalidate_count == 0);
+       spin_unlock(&kvm->mn_invalidate_lock);
+
+       /*
+        * There can only be one waiter, since the wait happens under
+        * slots_lock.
+        */
+       if (wake)
+               rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
 
@@ -897,7 +910,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
        char dir_name[ITOA_MAX_LEN * 2];
        struct kvm_stat_data *stat_data;
        const struct _kvm_stats_desc *pdesc;
-       int i;
+       int i, ret;
        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
                                      kvm_vcpu_stats_header.num_desc;
 
@@ -954,6 +967,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);
        }
+
+       ret = kvm_arch_create_vm_debugfs(kvm);
+       if (ret) {
+               kvm_destroy_vm_debugfs(kvm);
+               return ret;
+       }
+
        return 0;
 }
 
@@ -974,6 +994,17 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
 {
 }
 
+/*
+ * Called after the per-VM debugfs files are created.  kvm->debugfs_dentry is
+ * already set up at this point, so arch-specific debugfs entries can be
+ * created under it.  Cleanup is done automatically and recursively by
+ * kvm_destroy_vm_debugfs(), so no per-arch destroy interface is needed.
+ */
+int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+       return 0;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        struct kvm *kvm = kvm_arch_alloc_vm();
@@ -991,6 +1022,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
        mutex_init(&kvm->slots_arch_lock);
+       spin_lock_init(&kvm->mn_invalidate_lock);
+       rcuwait_init(&kvm->mn_memslots_update_rcuwait);
+
        INIT_LIST_HEAD(&kvm->devices);
 
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1113,6 +1147,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+       /*
+        * At this point, pending calls to invalidate_range_start()
+        * have completed but no more MMU notifiers will run, so
+        * mn_active_invalidate_count may remain unbalanced.
+        * No threads can be waiting in install_new_memslots as the
+        * last reference on KVM has been dropped, but freeing
+        * memslots would deadlock without this manual intervention.
+        */
+       WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+       kvm->mn_active_invalidate_count = 0;
 #else
        kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1134,6 +1178,16 @@ void kvm_get_kvm(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_get_kvm);
 
+/*
+ * Grab a reference unless the VM is already being destroyed; this is the safe
+ * counterpart of kvm_get_kvm().  Returns true if a reference was taken.
+ */
+bool kvm_get_kvm_safe(struct kvm *kvm)
+{
+       return refcount_inc_not_zero(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
+
 void kvm_put_kvm(struct kvm *kvm)
 {
        if (refcount_dec_and_test(&kvm->users_count))
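
kvm_get_kvm_safe() is the usual get-unless-zero idiom: the reference is taken
only if the count has not already dropped to zero, i.e. only if kvm_destroy_vm()
is not already under way.  The same pattern, expressed as a userspace analogue
with C11 atomics rather than the kernel's refcount_t:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Take a reference only if the object is not already being torn down. */
    static bool ref_get_unless_zero(atomic_uint *refcount)
    {
            unsigned int old = atomic_load(refcount);

            while (old != 0) {
                    /* On failure the CAS reloads 'old', so just retry. */
                    if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                            return true;
            }

            return false;
    }
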
@@ -1194,8 +1248,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots,
 
        slots->used_slots--;
 
-       if (atomic_read(&slots->lru_slot) >= slots->used_slots)
-               atomic_set(&slots->lru_slot, 0);
+       if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
+               atomic_set(&slots->last_used_slot, 0);
 
        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
                mslots[i] = mslots[i + 1];
@@ -1364,7 +1418,22 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+       /*
+        * Do not store the new memslots while there are invalidations in
+        * progress, otherwise the locking in invalidate_range_start and
+        * invalidate_range_end will be unbalanced.
+        */
+       spin_lock(&kvm->mn_invalidate_lock);
+       prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
+       while (kvm->mn_active_invalidate_count) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_unlock(&kvm->mn_invalidate_lock);
+               schedule();
+               spin_lock(&kvm->mn_invalidate_lock);
+       }
+       finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
        rcu_assign_pointer(kvm->memslots[as_id], slots);
+       spin_unlock(&kvm->mn_invalidate_lock);
 
        /*
         * Acquired in kvm_set_memslot. Must be released before synchronize
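
Taken together, these hunks form a small draining protocol:
invalidate_range_start() bumps mn_active_invalidate_count,
invalidate_range_end() drops it and wakes the single rcuwait waiter, and
install_new_memslots() sleeps until the count reaches zero before publishing
the new memslots, all serialized by mn_invalidate_lock.  The same shape can be
written as a userspace analogue with a mutex and condition variable (an
illustration of the pattern, not the kernel's spinlock-plus-rcuwait code):

    #include <pthread.h>

    /* Userspace analogue of mn_active_invalidate_count and its one waiter. */
    struct inflight {
            pthread_mutex_t lock;
            pthread_cond_t drained;
            unsigned long count;
    };

    #define INFLIGHT_INIT \
            { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

    static void inflight_begin(struct inflight *f)     /* range_start() side */
    {
            pthread_mutex_lock(&f->lock);
            f->count++;
            pthread_mutex_unlock(&f->lock);
    }

    static void inflight_end(struct inflight *f)       /* range_end() side */
    {
            pthread_mutex_lock(&f->lock);
            if (--f->count == 0)
                    pthread_cond_signal(&f->drained);  /* at most one waiter */
            pthread_mutex_unlock(&f->lock);
    }

    static void inflight_wait_and_publish(struct inflight *f)
    {
            pthread_mutex_lock(&f->lock);
            while (f->count)
                    pthread_cond_wait(&f->drained, &f->lock);
            /* ... publish the new state here, still under the lock ... */
            pthread_mutex_unlock(&f->lock);
    }
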
@@ -1980,7 +2049,26 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
-       return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
+       struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+       struct kvm_memory_slot *slot;
+       int slot_index;
+
+       slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
+       if (slot)
+               return slot;
+
+       /*
+        * Fall back to searching all memslots. We purposely use
+        * search_memslots() instead of __gfn_to_memslot() to avoid
+        * thrashing the VM-wide last_used_slot in kvm_memslots.
+        */
+       slot = search_memslots(slots, gfn, &slot_index);
+       if (slot) {
+               vcpu->last_used_slot = slot_index;
+               return slot;
+       }
+
+       return NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
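
kvm_vcpu_gfn_to_memslot() now consults the vCPU-local last_used_slot before
falling back to a full lookup, so vCPUs that repeatedly fault on different
memslots stop thrashing the single VM-wide cache.  Stripped of the memslot
specifics, the cache-then-search shape looks like the generic sketch below (a
toy over [base, base + len) ranges, not the kernel's data structures):

    #include <stddef.h>
    #include <stdint.h>

    struct range {
            uint64_t base;
            uint64_t len;
    };

    /* Check the per-caller cached index first, then search and refresh it. */
    static struct range *find_range(struct range *ranges, size_t nr,
                                    size_t *cached, uint64_t addr)
    {
            size_t i = *cached;

            /* Fast path: the range this caller used last time still matches. */
            if (i < nr && addr - ranges[i].base < ranges[i].len)
                    return &ranges[i];

            /* Slow path: full search, then refresh the per-caller cache. */
            for (i = 0; i < nr; i++) {
                    if (addr - ranges[i].base < ranges[i].len) {
                            *cached = i;
                            return &ranges[i];
                    }
            }

            return NULL;
    }
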
 
@@ -2239,7 +2327,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
         * Get a reference here because callers of *hva_to_pfn* and
         * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
         * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
-        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
         * simply do nothing for reserved pfns.
         *
         * Whoever called remap_pfn_range is also going to call e.g.
@@ -2636,13 +2724,6 @@ void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
-void kvm_get_pfn(kvm_pfn_t pfn)
-{
-       if (!kvm_is_reserved_pfn(pfn))
-               get_page(pfn_to_page(pfn));
-}
-EXPORT_SYMBOL_GPL(kvm_get_pfn);
-
 static int next_segment(unsigned long len, int offset)
 {
        if (len > PAGE_SIZE - offset)
@@ -3122,13 +3203,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                                ++vcpu->stat.generic.halt_successful_poll;
                                if (!vcpu_valid_wakeup(vcpu))
                                        ++vcpu->stat.generic.halt_poll_invalid;
+
+                               KVM_STATS_LOG_HIST_UPDATE(
+                                     vcpu->stat.generic.halt_poll_success_hist,
+                                     ktime_to_ns(ktime_get()) -
+                                     ktime_to_ns(start));
                                goto out;
                        }
                        cpu_relax();
                        poll_end = cur = ktime_get();
                } while (kvm_vcpu_can_poll(cur, stop));
+
+               KVM_STATS_LOG_HIST_UPDATE(
+                               vcpu->stat.generic.halt_poll_fail_hist,
+                               ktime_to_ns(ktime_get()) - ktime_to_ns(start));
        }
 
+
        prepare_to_rcuwait(&vcpu->wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
@@ -3141,6 +3232,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        }
        finish_rcuwait(&vcpu->wait);
        cur = ktime_get();
+       if (waited) {
+               vcpu->stat.generic.halt_wait_ns +=
+                       ktime_to_ns(cur) - ktime_to_ns(poll_end);
+               KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
+                               ktime_to_ns(cur) - ktime_to_ns(poll_end));
+       }
 out:
        kvm_arch_vcpu_unblocking(vcpu);
        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
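
The three new histograms record, in nanoseconds, how long successful polls,
failed polls and the actual blocking wait took.  KVM_STATS_LOG_HIST_UPDATE
feeds a logarithmic histogram, so bucket N presumably gathers samples that need
about N significant bits; a hypothetical userspace equivalent of such bucketing
(an assumption about the helper's behaviour, not its kernel definition):

    #include <stdint.h>

    /*
     * Hypothetical log2 bucketing: bucket 0 holds value 0, bucket N holds
     * values whose highest set bit is N-1, and anything too large lands in
     * the last bucket.
     */
    static inline void log_hist_update(uint64_t *data, uint64_t buckets,
                                       uint64_t value)
    {
            uint64_t index = value ? 64 - __builtin_clzll(value) : 0;

            if (index >= buckets)
                    index = buckets - 1;
            data[index]++;
    }
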
@@ -3612,7 +3709,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
        struct kvm_fpu *fpu = NULL;
        struct kvm_sregs *kvm_sregs = NULL;
 
-       if (vcpu->kvm->mm != current->mm)
+       if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
                return -EIO;
 
        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
@@ -3822,7 +3919,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
        void __user *argp = compat_ptr(arg);
        int r;
 
-       if (vcpu->kvm->mm != current->mm)
+       if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
                return -EIO;
 
        switch (ioctl) {
@@ -3888,7 +3985,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
        struct kvm_device *dev = filp->private_data;
 
-       if (dev->kvm->mm != current->mm)
+       if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
                return -EIO;
 
        switch (ioctl) {
@@ -4210,7 +4307,7 @@ static long kvm_vm_ioctl(struct file *filp,
        void __user *argp = (void __user *)arg;
        int r;
 
-       if (kvm->mm != current->mm)
+       if (kvm->mm != current->mm || kvm->vm_bugged)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
@@ -4421,7 +4518,7 @@ static long kvm_vm_compat_ioctl(struct file *filp,
        struct kvm *kvm = filp->private_data;
        int r;
 
-       if (kvm->mm != current->mm)
+       if (kvm->mm != current->mm || kvm->vm_bugged)
                return -EIO;
        switch (ioctl) {
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
@@ -4983,12 +5080,12 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
                                          inode->i_private;
 
-       /* The debugfs files are a reference to the kvm struct which
-        * is still valid when kvm_destroy_vm is called.
-        * To avoid the race between open and the removal of the debugfs
-        * directory we test against the users count.
+       /*
+        * The debugfs files hold a reference to the kvm struct, which is still
+        * valid when kvm_destroy_vm() is called.  kvm_get_kvm_safe() avoids the
+        * race between open and the removal of the debugfs directory.
         */
-       if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
+       if (!kvm_get_kvm_safe(stat_data->kvm))
                return -ENOENT;
 
        if (simple_attr_open(inode, file, get,