Merge branch 'kvm-older-features' into HEAD
author     Paolo Bonzini <pbonzini@redhat.com>
           Fri, 8 Apr 2022 16:43:40 +0000 (12:43 -0400)
committer  Paolo Bonzini <pbonzini@redhat.com>
           Wed, 13 Apr 2022 17:37:17 +0000 (13:37 -0400)
Merge branch for features that did not make it into 5.18:

* New ioctls to get/set TSC frequency for a whole VM

* Allow userspace to opt out of hypercall patching

Nested virtualization improvements for AMD:

* Support for "nested nested" optimizations (nested vVMLOAD/VMSAVE,
  nested vGIF)

* Allow AVIC to co-exist with a nested guest running

* Fixes for LBR virtualization when a nested guest is running,
  and nested LBR virtualization support

* PAUSE filtering for nested hypervisors

Guest support:

* Decoupling of vcpu_is_preempted from PV spinlocks

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Documentation/virt/kvm/api.rst
arch/x86/include/asm/kvm_host.h
arch/x86/kernel/kvm.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/svm/avic.c
arch/x86/kvm/x86.c
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile

@@@ -982,12 -982,22 +982,22 @@@ memory
        __u8 pad2[30];
    };
  
- If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
- KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
- This requests KVM to generate the contents of the hypercall page
- automatically; hypercalls will be intercepted and passed to userspace
- through KVM_EXIT_XEN.  In this case, all of the blob size and address
- fields must be zero.
+ If certain flags are returned from the KVM_CAP_XEN_HVM check, they may
+ be set in the flags field of this ioctl:
+ The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
+ the contents of the hypercall page automatically; hypercalls will be
+ intercepted and passed to userspace through KVM_EXIT_XEN.  In this
+ case, all of the blob size and address fields must be zero.
+ The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
+ will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
+ channel interrupts rather than manipulating the guest's shared_info
+ structures directly. This, in turn, may allow KVM to enable features
+ such as intercepting the SCHEDOP_poll hypercall to accelerate PV
+ spinlock operation for the guest. Userspace may still use the ioctl
+ to deliver events if it was advertised, even if userspace does not
+ send this indication that it will always do so.
  
  No other flags are currently valid in the struct kvm_xen_hvm_config.
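
A minimal userspace sketch (editorial illustration, not part of this merge) of
how these flags might be enabled, assuming a VM file descriptor ``vm_fd`` from
KVM_CREATE_VM and the usual <linux/kvm.h>, <sys/ioctl.h> and <stdio.h>
includes::

   struct kvm_xen_hvm_config cfg = { 0 };
   int caps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

   if (caps > 0 && (caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)) {
           /* Hypercalls will exit to userspace via KVM_EXIT_XEN,
            * so the blob size/address fields stay zero. */
           cfg.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;
           if (caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)
                   /* Promise to deliver events via KVM_XEN_HVM_EVTCHN_SEND. */
                   cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
           if (ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg) < 0)
                   perror("KVM_XEN_HVM_CONFIG");
   }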
  
@@@ -1887,22 -1897,25 +1897,25 @@@ the future
  4.55 KVM_SET_TSC_KHZ
  --------------------
  
- :Capability: KVM_CAP_TSC_CONTROL
+ :Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
  :Architectures: x86
- :Type: vcpu ioctl
+ :Type: vcpu ioctl / vm ioctl
  :Parameters: virtual tsc_khz
  :Returns: 0 on success, -1 on error
  
  Specifies the tsc frequency for the virtual machine. The unit of the
  frequency is KHz.
  
+ If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
+ be used as a vm ioctl to set the initial tsc frequency of subsequently
+ created vCPUs.
  
  4.56 KVM_GET_TSC_KHZ
  --------------------
  
- :Capability: KVM_CAP_GET_TSC_KHZ
+ :Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
  :Architectures: x86
- :Type: vcpu ioctl
+ :Type: vcpu ioctl / vm ioctl
  :Parameters: none
  :Returns: virtual tsc-khz on success, negative value on error
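
A hypothetical sketch (not from this patch) of the VM-scoped usage, assuming
``vm_fd`` is a VM file descriptor and a default frequency of 2.5 GHz is wanted
for all vCPUs created afterwards::

   if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) > 0) {
           if (ioctl(vm_fd, KVM_SET_TSC_KHZ, 2500000) < 0)
                   perror("KVM_SET_TSC_KHZ");

           /* Reads back the value just set (2500000) on success. */
           int tsc_khz = ioctl(vm_fd, KVM_GET_TSC_KHZ, 0);
   }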
  
@@@ -5216,7 -5229,25 +5229,25 @@@ have deterministic behavior
                struct {
                        __u64 gfn;
                } shared_info;
-               __u64 pad[4];
+               struct {
+                       __u32 send_port;
+                       __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+                       __u32 flags;
+                       union {
+                               struct {
+                                       __u32 port;
+                                       __u32 vcpu;
+                                       __u32 priority;
+                               } port;
+                               struct {
+                                       __u32 port; /* Zero for eventfd */
+                                       __s32 fd;
+                               } eventfd;
+                               __u32 padding[4];
+                       } deliver;
+               } evtchn;
+               __u32 xen_version;
+               __u64 pad[8];
        } u;
    };
  
@@@ -5247,6 -5278,30 +5278,30 @@@ KVM_XEN_ATTR_TYPE_SHARED_INF
  
  KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
    Sets the exception vector used to deliver Xen event channel upcalls.
+   This is the HVM-wide vector injected directly by the hypervisor
+   (not through the local APIC), typically configured by a guest via
+   HVM_PARAM_CALLBACK_IRQ.

+ KVM_XEN_ATTR_TYPE_EVTCHN
+   This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+   support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+   an outbound port number for interception of EVTCHNOP_send requests
+   from the guest. A given sending port number may be directed back
+   to a specified vCPU (by APIC ID) / port / priority on the guest,
+   or to trigger events on an eventfd. The vCPU and priority can be
+   changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call,
+   but other fields cannot change for a given sending port. A port
+   mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags
+   field.

+ KVM_XEN_ATTR_TYPE_XEN_VERSION
+   This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+   support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+   the 32-bit version code returned to the guest when it invokes the
+   XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV
+ Xen guests will often use this as a dummy hypercall to trigger
+   event channel delivery, so responding within the kernel without
+   exiting to userspace is beneficial.
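
As an illustration only (not part of this merge), a VMM might intercept a
guest's EVTCHNOP_send on a port and route it to an eventfd. The port number
and the eventfd ``efd`` below are hypothetical, and EVTCHNSTAT_interdomain
comes from the Xen interface headers::

   struct kvm_xen_hvm_attr ha = {
           .type = KVM_XEN_ATTR_TYPE_EVTCHN,
           .u.evtchn.send_port = 3,
           .u.evtchn.type = EVTCHNSTAT_interdomain,
           .u.evtchn.deliver.eventfd.port = 0,   /* zero selects eventfd delivery */
           .u.evtchn.deliver.eventfd.fd = efd,   /* previously created eventfd */
   };

   if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha) < 0)
           perror("KVM_XEN_HVM_SET_ATTR");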
  
  4.127 KVM_XEN_HVM_GET_ATTR
  --------------------------
  :Returns: 0 on success, < 0 on error
  
  Allows Xen VM attributes to be read. For the structure and types,
- see KVM_XEN_HVM_SET_ATTR above.
+ see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN
+ attribute cannot be read.
  
  4.128 KVM_XEN_VCPU_SET_ATTR
  ---------------------------
                        __u64 time_blocked;
                        __u64 time_offline;
                } runstate;
+               __u32 vcpu_id;
+               struct {
+                       __u32 port;
+                       __u32 priority;
+                       __u64 expires_ns;
+               } timer;
+               __u8 vector;
        } u;
    };
  
@@@ -5326,6 -5389,27 +5389,27 @@@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUS
    or RUNSTATE_offline) to set the current accounted state as of the
    adjusted state_entry_time.
  
+ KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID
+   This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+   support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen
+   vCPU ID of the given vCPU, to allow timer-related VCPU operations to
+   be intercepted by KVM.

+ KVM_XEN_VCPU_ATTR_TYPE_TIMER
+   This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+   support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+   event channel port/priority for the VIRQ_TIMER of the vCPU, as well
+   as allowing a pending timer to be saved/restored.

+ KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
+   This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+   support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+   per-vCPU local APIC upcall vector, configured by a Xen guest with
+   the HVMOP_set_evtchn_upcall_vector hypercall. This is typically
+   used by Windows guests, and is distinct from the HVM-wide upcall
+   vector configured with HVM_PARAM_CALLBACK_IRQ.
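
A hedged sketch (not from this patch) of how the new per-vCPU attributes might
be used on a vCPU file descriptor ``vcpu_fd``; the Xen vCPU ID and the
VIRQ_TIMER port ``timer_port`` are placeholders::

   struct kvm_xen_vcpu_attr va = {
           .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
           .u.vcpu_id = 0,               /* Xen / ACPI vCPU ID of this vCPU */
   };
   ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);

   va.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
   va.u.timer.port = timer_port;         /* event channel bound to VIRQ_TIMER */
   va.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
   va.u.timer.expires_ns = 0;            /* no pending timer to restore */
   ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);
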
  4.129 KVM_XEN_VCPU_GET_ATTR
  ---------------------------
  
@@@ -5645,6 -5729,25 +5729,25 @@@ enabled with ``arch_prctl()``, but thi
  The offsets of the state save areas in struct kvm_xsave follow the contents
  of CPUID leaf 0xD on the host.
  
+ 4.135 KVM_XEN_HVM_EVTCHN_SEND
+ -----------------------------
+ :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND
+ :Architectures: x86
+ :Type: vm ioctl
+ :Parameters: struct kvm_irq_routing_xen_evtchn
+ :Returns: 0 on success, < 0 on error
+ ::

+    struct kvm_irq_routing_xen_evtchn {
+       __u32 port;
+       __u32 vcpu;
+       __u32 priority;
+    };

+ This ioctl injects an event channel interrupt directly to the guest vCPU.
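
For illustration (not part of the patch), delivering a 2-level event channel
on a hypothetical port 3 to vCPU 0 might look like::

   struct kvm_irq_routing_xen_evtchn evt = {
           .port = 3,                    /* guest event channel port */
           .vcpu = 0,                    /* destination vCPU */
           .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
   };

   if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt) < 0)
           perror("KVM_XEN_HVM_EVTCHN_SEND");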
  
  5. The kvm_run structure
  ========================
@@@ -6190,7 -6293,6 +6293,7 @@@ Valid values for 'type' are
                        unsigned long args[6];
                        unsigned long ret[2];
                } riscv_sbi;
 +
  If exit reason is KVM_EXIT_RISCV_SBI then it indicates that the VCPU has
  done a SBI call which is not handled by KVM RISC-V kernel module. The details
  of the SBI call are available in 'riscv_sbi' member of kvm_run structure. The
@@@ -7135,6 -7237,15 +7238,15 @@@ The valid bits in cap.args[0] are
                                      Additionally, when this quirk is disabled,
                                      KVM clears CPUID.01H:ECX[bit 3] if
                                      IA32_MISC_ENABLE[bit 18] is cleared.
+  KVM_X86_QUIRK_FIX_HYPERCALL_INSN   By default, KVM rewrites guest
+                                     VMMCALL/VMCALL instructions to match the
+                                     vendor's hypercall instruction for the
+                                     system. When this quirk is disabled, KVM
+                                     will no longer rewrite invalid guest
+                                     hypercall instructions. Executing the
+                                     incorrect hypercall instruction will
+                                     generate a #UD within the guest.
  =================================== ============================================
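
As a non-authoritative sketch, userspace could disable this quirk via
KVM_ENABLE_CAP with KVM_CAP_DISABLE_QUIRKS2 on the VM file descriptor, so that
mismatched VMCALL/VMMCALL instructions raise #UD in the guest rather than
being patched::

   struct kvm_enable_cap cap = {
           .cap = KVM_CAP_DISABLE_QUIRKS2,
           .args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,
   };

   if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
           perror("KVM_ENABLE_CAP");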
  
  8. Other capabilities.
@@@ -7612,8 -7723,9 +7724,9 @@@ PVHVM guests. Valid flags are:
    #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR    (1 << 0)
    #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL  (1 << 1)
    #define KVM_XEN_HVM_CONFIG_SHARED_INFO      (1 << 2)
-   #define KVM_XEN_HVM_CONFIG_RUNSTATE         (1 << 2)
-   #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL    (1 << 3)
+   #define KVM_XEN_HVM_CONFIG_RUNSTATE         (1 << 3)
+   #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL    (1 << 4)
+   #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND      (1 << 5)
  
  The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
  ioctl is available, for the guest to set its hypercall page.
@@@ -7637,6 -7749,14 +7750,14 @@@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL fl
  of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
  field set to indicate 2 level event channel delivery.
  
+ The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports
+ injecting event channel events directly into the guest with the
+ KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the
+ KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the
+ KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes
+ related to event channel delivery, timers, and the XENVER_version
+ interception.

  8.31 KVM_CAP_PPC_MULTITCE
  -------------------------
  
@@@ -502,7 -502,6 +502,7 @@@ struct kvm_pmc 
        bool intr;
  };
  
 +#define KVM_PMC_MAX_FIXED     3
  struct kvm_pmu {
        unsigned nr_arch_gp_counters;
        unsigned nr_arch_fixed_counters;
        u64 raw_event_mask;
        u8 version;
        struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 -      struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
 +      struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
        struct irq_work irq_work;
        DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
        DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
@@@ -607,16 -606,21 +607,21 @@@ struct kvm_vcpu_hv 
  struct kvm_vcpu_xen {
        u64 hypercall_rip;
        u32 current_runstate;
-       bool vcpu_info_set;
-       bool vcpu_time_info_set;
-       bool runstate_set;
-       struct gfn_to_hva_cache vcpu_info_cache;
-       struct gfn_to_hva_cache vcpu_time_info_cache;
-       struct gfn_to_hva_cache runstate_cache;
+       u8 upcall_vector;
+       struct gfn_to_pfn_cache vcpu_info_cache;
+       struct gfn_to_pfn_cache vcpu_time_info_cache;
+       struct gfn_to_pfn_cache runstate_cache;
        u64 last_steal;
        u64 runstate_entry_time;
        u64 runstate_times[4];
        unsigned long evtchn_pending_sel;
+       u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+       u32 timer_virq;
+       u64 timer_expires; /* In guest epoch */
+       atomic_t timer_pending;
+       struct hrtimer timer;
+       int poll_evtchn;
+       struct timer_list poll_timer;
  };
  
  struct kvm_vcpu_arch {
        gpa_t time;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned int hw_tsc_khz;
-       struct gfn_to_hva_cache pv_time;
-       bool pv_time_enabled;
+       struct gfn_to_pfn_cache pv_time;
        /* set guest stopped flag in pvclock flags field */
        bool pvclock_set_guest_stopped_request;
  
@@@ -974,10 -977,12 +978,10 @@@ enum hv_tsc_page_status 
        HV_TSC_PAGE_UNSET = 0,
        /* TSC page MSR was written by the guest, update pending */
        HV_TSC_PAGE_GUEST_CHANGED,
 -      /* TSC page MSR was written by KVM userspace, update pending */
 +      /* TSC page update was triggered from the host side */
        HV_TSC_PAGE_HOST_CHANGED,
        /* TSC page was properly set up and is currently active  */
        HV_TSC_PAGE_SET,
 -      /* TSC page is currently being updated and therefore is inactive */
 -      HV_TSC_PAGE_UPDATING,
        /* TSC page was set up with an inaccessible GPA */
        HV_TSC_PAGE_BROKEN,
  };
@@@ -1024,9 -1029,12 +1028,12 @@@ struct msr_bitmap_range 
  
  /* Xen emulation context */
  struct kvm_xen {
+       u32 xen_version;
        bool long_mode;
        u8 upcall_vector;
        struct gfn_to_pfn_cache shinfo_cache;
+       struct idr evtchn_ports;
+       unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
  };
  
  enum kvm_irqchip_mode {
@@@ -1050,7 -1058,6 +1057,7 @@@ enum kvm_apicv_inhibit 
        APICV_INHIBIT_REASON_X2APIC,
        APICV_INHIBIT_REASON_BLOCKIRQ,
        APICV_INHIBIT_REASON_ABSENT,
 +      APICV_INHIBIT_REASON_SEV,
  };
  
  struct kvm_arch {
        u64 cur_tsc_generation;
        int nr_vcpus_matched_tsc;
  
+       u32 default_tsc_khz;
        seqcount_raw_spinlock_t pvclock_sc;
        bool use_master_clock;
        u64 master_kernel_ns;
@@@ -1498,6 -1507,11 +1507,11 @@@ struct kvm_x86_ops 
        int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
  
        void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+       /*
+        * Returns vCPU specific APICv inhibit reasons
+        */
+       unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
  };
  
  struct kvm_x86_nested_ops {
@@@ -1584,9 -1598,8 +1598,9 @@@ static inline int kvm_arch_flush_remote
  #define kvm_arch_pmi_in_guest(vcpu) \
        ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
  
 -int kvm_mmu_module_init(void);
 -void kvm_mmu_module_exit(void);
 +void kvm_mmu_x86_module_init(void);
 +int kvm_mmu_vendor_module_init(void);
 +void kvm_mmu_vendor_module_exit(void);
  
  void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
  int kvm_mmu_create(struct kvm_vcpu *vcpu);
@@@ -1799,6 -1812,7 +1813,7 @@@ gpa_t kvm_mmu_gva_to_gpa_system(struct 
                                struct x86_exception *exception);
  
  bool kvm_apicv_activated(struct kvm *kvm);
+ bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
  void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
  void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
                                      enum kvm_apicv_inhibit reason, bool set);
@@@ -1988,6 -2002,7 +2003,7 @@@ int memslot_rmap_alloc(struct kvm_memor
         KVM_X86_QUIRK_CD_NW_CLEARED |          \
         KVM_X86_QUIRK_LAPIC_MMIO_HOLE |        \
         KVM_X86_QUIRK_OUT_7E_INC_RIP |         \
-        KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
+        KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |   \
+        KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
  
  #endif /* _ASM_X86_KVM_HOST_H */
diff --combined arch/x86/kernel/kvm.c
@@@ -752,6 -752,41 +752,42 @@@ static void kvm_crash_shutdown(struct p
  }
  #endif
  
 -"ret;"
+ #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+ bool __kvm_vcpu_is_preempted(long cpu);
+ __visible bool __kvm_vcpu_is_preempted(long cpu)
+ {
+       struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+       return !!(src->preempted & KVM_VCPU_PREEMPTED);
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+ #else
+ #include <asm/asm-offsets.h>
+ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+ /*
+  * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+  * restoring to/from the stack.
+  */
+ asm(
+ ".pushsection .text;"
+ ".global __raw_callee_save___kvm_vcpu_is_preempted;"
+ ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+ "__raw_callee_save___kvm_vcpu_is_preempted:"
++ASM_ENDBR
+ "movq __per_cpu_offset(,%rdi,8), %rax;"
+ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+ "setne        %al;"
++ASM_RET
+ ".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+ ".popsection");
+ #endif
  static void __init kvm_guest_init(void)
  {
        int i;
        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                static_call_update(pv_steal_clock, kvm_steal_clock);
+               pv_ops.lock.vcpu_is_preempted =
+                       PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
        }
  
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@@ -1005,40 -1043,6 +1044,6 @@@ static void kvm_wait(u8 *ptr, u8 val
        }
  }
  
- #ifdef CONFIG_X86_32
- __visible bool __kvm_vcpu_is_preempted(long cpu)
- {
-       struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
-       return !!(src->preempted & KVM_VCPU_PREEMPTED);
- }
- PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
- #else
- #include <asm/asm-offsets.h>
- extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
- /*
-  * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
-  * restoring to/from the stack.
-  */
- asm(
- ".pushsection .text;"
- ".global __raw_callee_save___kvm_vcpu_is_preempted;"
- ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
- "__raw_callee_save___kvm_vcpu_is_preempted:"
- ASM_ENDBR
- "movq __per_cpu_offset(,%rdi,8), %rax;"
- "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
- "setne        %al;"
- ASM_RET
- ".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
- ".popsection");
- #endif
  /*
   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
   */
@@@ -1082,10 -1086,6 +1087,6 @@@ void __init kvm_spinlock_init(void
        pv_ops.lock.wait = kvm_wait;
        pv_ops.lock.kick = kvm_kick_cpu;
  
-       if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
-               pv_ops.lock.vcpu_is_preempted =
-                       PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
-       }
        /*
         * When PV spinlock is enabled which is preferred over
         * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
diff --combined arch/x86/kvm/mmu/mmu.c
@@@ -1866,17 -1866,14 +1866,14 @@@ static void kvm_mmu_commit_zap_page(str
          &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
                if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
  
- static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
  {
        int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
  
-       if (ret < 0) {
+       if (ret < 0)
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return false;
-       }
-       return !!ret;
+       return ret;
  }
  
  static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@@ -1998,7 -1995,7 +1995,7 @@@ static int mmu_sync_children(struct kvm
  
                for_each_sp(pages, sp, parents, i) {
                        kvm_unlink_unsync_page(vcpu->kvm, sp);
-                       flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+                       flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
                        mmu_pages_clear_parents(&parents);
                }
                if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
@@@ -2039,6 -2036,7 +2036,7 @@@ static struct kvm_mmu_page *kvm_mmu_get
        struct hlist_head *sp_list;
        unsigned quadrant;
        struct kvm_mmu_page *sp;
+       int ret;
        int collisions = 0;
        LIST_HEAD(invalid_list);
  
                         * If the sync fails, the page is zapped.  If so, break
                         * in order to rebuild it.
                         */
-                       if (!kvm_sync_page(vcpu, sp, &invalid_list))
+                       ret = kvm_sync_page(vcpu, sp, &invalid_list);
+                       if (ret < 0)
                                break;
  
                        WARN_ON(!list_empty(&invalid_list));
-                       kvm_flush_remote_tlbs(vcpu->kvm);
+                       if (ret > 0)
+                               kvm_flush_remote_tlbs(vcpu->kvm);
                }
  
                __clear_sp_write_flooding_count(sp);
@@@ -6237,24 -6237,12 +6237,24 @@@ static int set_nx_huge_pages(const cha
        return 0;
  }
  
 -int kvm_mmu_module_init(void)
 +/*
 + * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
 + * its default value of -1 is technically undefined behavior for a boolean.
 + */
 +void kvm_mmu_x86_module_init(void)
  {
 -      int ret = -ENOMEM;
 -
        if (nx_huge_pages == -1)
                __set_nx_huge_pages(get_nx_auto_mode());
 +}
 +
 +/*
 + * The bulk of the MMU initialization is deferred until the vendor module is
 + * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
 + * to be reset when a potentially different vendor module is loaded.
 + */
 +int kvm_mmu_vendor_module_init(void)
 +{
 +      int ret = -ENOMEM;
  
        /*
         * MMU roles use union aliasing which is, generally speaking, an
@@@ -6302,7 -6290,7 +6302,7 @@@ void kvm_mmu_destroy(struct kvm_vcpu *v
        mmu_free_memory_caches(vcpu);
  }
  
 -void kvm_mmu_module_exit(void)
 +void kvm_mmu_vendor_module_exit(void)
  {
        mmu_destroy_caches();
        percpu_counter_destroy(&kvm_total_used_mmu_pages);
diff --combined arch/x86/kvm/svm/avic.c
@@@ -165,9 -165,8 +165,8 @@@ free_avic
        return err;
  }
  
- void avic_init_vmcb(struct vcpu_svm *svm)
+ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
  {
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
        phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
        phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@@@ -357,6 -356,13 +356,13 @@@ int avic_incomplete_ipi_interception(st
        return 1;
  }
  
+ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+ {
+       if (is_guest_mode(vcpu))
+               return APICV_INHIBIT_REASON_NESTED;
+       return 0;
+ }
  static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
  {
        struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
@@@ -837,8 -843,7 +843,8 @@@ bool avic_check_apicv_inhibit_reasons(e
                          BIT(APICV_INHIBIT_REASON_IRQWIN) |
                          BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
                          BIT(APICV_INHIBIT_REASON_X2APIC) |
 -                        BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
 +                        BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
 +                        BIT(APICV_INHIBIT_REASON_SEV);
  
        return supported & BIT(reason);
  }
diff --combined arch/x86/kvm/x86.c
@@@ -961,11 -961,13 +961,13 @@@ void kvm_load_guest_xsave_state(struct 
                        wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
        }
  
+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (static_cpu_has(X86_FEATURE_PKU) &&
-           (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
-            (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
-           vcpu->arch.pkru != vcpu->arch.host_pkru)
+           vcpu->arch.pkru != vcpu->arch.host_pkru &&
+           ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+            kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
                write_pkru(vcpu->arch.pkru);
+ #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
  }
  EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
  
@@@ -974,13 -976,15 +976,15 @@@ void kvm_load_host_xsave_state(struct k
        if (vcpu->arch.guest_state_protected)
                return;
  
+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (static_cpu_has(X86_FEATURE_PKU) &&
-           (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
-            (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+           ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+            kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
                vcpu->arch.pkru = rdpkru();
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                        write_pkru(vcpu->arch.host_pkru);
        }
+ #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
  
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
  
@@@ -1771,7 -1775,7 +1775,7 @@@ static int __kvm_set_msr(struct kvm_vcp
                 * value, and that something deterministic happens if the guest
                 * invokes 64-bit SYSENTER.
                 */
 -              data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
 +              data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
                break;
        case MSR_TSC_AUX:
                if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
@@@ -2249,14 -2253,13 +2253,13 @@@ static void kvm_write_system_time(struc
        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
  
        /* we verify if the enable bit is set... */
-       vcpu->arch.pv_time_enabled = false;
-       if (!(system_time & 1))
-               return;
-       if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
-                                      &vcpu->arch.pv_time, system_time & ~1ULL,
-                                      sizeof(struct pvclock_vcpu_time_info)))
-               vcpu->arch.pv_time_enabled = true;
+       if (system_time & 1) {
+               kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+                                         KVM_HOST_USES_PFN, system_time & ~1ULL,
+                                         sizeof(struct pvclock_vcpu_time_info));
+       } else {
+               kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+       }
  
        return;
  }
@@@ -2901,7 -2904,7 +2904,7 @@@ static void kvm_end_pvclock_update(stru
  
  static void kvm_update_masterclock(struct kvm *kvm)
  {
 -      kvm_hv_invalidate_tsc_page(kvm);
 +      kvm_hv_request_tsc_page_update(kvm);
        kvm_start_pvclock_update(kvm);
        pvclock_update_vm_gtod_copy(kvm);
        kvm_end_pvclock_update(kvm);
@@@ -2961,63 -2964,55 +2964,55 @@@ u64 get_kvmclock_ns(struct kvm *kvm
        return data.clock;
  }
  
- static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
-                                  struct gfn_to_hva_cache *cache,
-                                  unsigned int offset)
+ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+                                   struct gfn_to_pfn_cache *gpc,
+                                   unsigned int offset)
  {
        struct kvm_vcpu_arch *vcpu = &v->arch;
-       struct pvclock_vcpu_time_info guest_hv_clock;
+       struct pvclock_vcpu_time_info *guest_hv_clock;
+       unsigned long flags;
  
-       if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
-               &guest_hv_clock, offset, sizeof(guest_hv_clock))))
-               return;
+       read_lock_irqsave(&gpc->lock, flags);
+       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+                                          offset + sizeof(*guest_hv_clock))) {
+               read_unlock_irqrestore(&gpc->lock, flags);
+               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+                                                offset + sizeof(*guest_hv_clock)))
+                       return;
  
-       /* This VCPU is paused, but it's legal for a guest to read another
+               read_lock_irqsave(&gpc->lock, flags);
+       }
+       guest_hv_clock = (void *)(gpc->khva + offset);
+       /*
+        * This VCPU is paused, but it's legal for a guest to read another
         * VCPU's kvmclock, so we really have to follow the specification where
         * it says that version is odd if data is being modified, and even after
         * it is consistent.
-        *
-        * Version field updates must be kept separate.  This is because
-        * kvm_write_guest_cached might use a "rep movs" instruction, and
-        * writes within a string instruction are weakly ordered.  So there
-        * are three writes overall.
-        *
-        * As a small optimization, only write the version field in the first
-        * and third write.  The vcpu->pv_time cache is still valid, because the
-        * version field is the first in the struct.
         */
-       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-       if (guest_hv_clock.version & 1)
-               ++guest_hv_clock.version;  /* first time write, random junk */
-       vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_offset_cached(v->kvm, cache,
-                                     &vcpu->hv_clock, offset,
-                                     sizeof(vcpu->hv_clock.version));
  
+       guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
        smp_wmb();
  
        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-       vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+       vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
  
        if (vcpu->pvclock_set_guest_stopped_request) {
                vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
                vcpu->pvclock_set_guest_stopped_request = false;
        }
  
-       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+       memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+       smp_wmb();
  
-       kvm_write_guest_offset_cached(v->kvm, cache,
-                                     &vcpu->hv_clock, offset,
-                                     sizeof(vcpu->hv_clock));
+       guest_hv_clock->version = ++vcpu->hv_clock.version;
  
-       smp_wmb();
+       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+       read_unlock_irqrestore(&gpc->lock, flags);
  
-       vcpu->hv_clock.version++;
-       kvm_write_guest_offset_cached(v->kvm, cache,
-                                    &vcpu->hv_clock, offset,
-                                    sizeof(vcpu->hv_clock.version));
+       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
  }
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  
        vcpu->hv_clock.flags = pvclock_flags;
  
-       if (vcpu->pv_time_enabled)
-               kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
-       if (vcpu->xen.vcpu_info_set)
-               kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
-                                      offsetof(struct compat_vcpu_info, time));
-       if (vcpu->xen.vcpu_time_info_set)
-               kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
+       if (vcpu->pv_time.active)
+               kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+       if (vcpu->xen.vcpu_info_cache.active)
+               kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+                                       offsetof(struct compat_vcpu_info, time));
+       if (vcpu->xen.vcpu_time_info_cache.active)
+               kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
 -      if (!v->vcpu_idx)
 -              kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 +      kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
  }
  
@@@ -3300,7 -3296,7 +3295,7 @@@ static int kvm_pv_enable_async_pf_int(s
  
  static void kvmclock_reset(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.pv_time_enabled = false;
+       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
        vcpu->arch.time = 0;
  }
  
@@@ -4284,7 -4280,8 +4279,8 @@@ int kvm_vm_ioctl_check_extension(struc
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
                    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
                    KVM_XEN_HVM_CONFIG_SHARED_INFO |
-                   KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
+                   KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+                   KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
                if (sched_info_on())
                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
                break;
                r = boot_cpu_has(X86_FEATURE_XSAVE);
                break;
        case KVM_CAP_TSC_CONTROL:
+       case KVM_CAP_VM_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
        case KVM_CAP_X2APIC_API:
@@@ -5102,7 -5100,7 +5099,7 @@@ static int kvm_vcpu_ioctl_x86_set_xcrs(
   */
  static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
  {
-       if (!vcpu->arch.pv_time_enabled)
+       if (!vcpu->arch.pv_time.active)
                return -EINVAL;
        vcpu->arch.pvclock_set_guest_stopped_request = true;
        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@@ -6186,7 -6184,7 +6183,7 @@@ static int kvm_arch_suspend_notifier(st
  
        mutex_lock(&kvm->lock);
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!vcpu->arch.pv_time_enabled)
+               if (!vcpu->arch.pv_time.active)
                        continue;
  
                ret = kvm_set_guest_paused(vcpu);
@@@ -6240,7 -6238,7 +6237,7 @@@ static int kvm_vm_ioctl_set_clock(struc
        if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
                return -EINVAL;
  
 -      kvm_hv_invalidate_tsc_page(kvm);
 +      kvm_hv_request_tsc_page_update(kvm);
        kvm_start_pvclock_update(kvm);
        pvclock_update_vm_gtod_copy(kvm);
  
@@@ -6513,6 -6511,15 +6510,15 @@@ set_pit2_out
                r = kvm_xen_hvm_set_attr(kvm, &xha);
                break;
        }
+       case KVM_XEN_HVM_EVTCHN_SEND: {
+               struct kvm_irq_routing_xen_evtchn uxe;
+               r = -EFAULT;
+               if (copy_from_user(&uxe, argp, sizeof(uxe)))
+                       goto out;
+               r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+               break;
+       }
  #endif
        case KVM_SET_CLOCK:
                r = kvm_vm_ioctl_set_clock(kvm, argp);
        case KVM_GET_CLOCK:
                r = kvm_vm_ioctl_get_clock(kvm, argp);
                break;
+       case KVM_SET_TSC_KHZ: {
+               u32 user_tsc_khz;
+               r = -EINVAL;
+               user_tsc_khz = (u32)arg;
+               if (kvm_has_tsc_control &&
+                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+                       goto out;
+               if (user_tsc_khz == 0)
+                       user_tsc_khz = tsc_khz;
+               WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+               r = 0;
+               goto out;
+       }
+       case KVM_GET_TSC_KHZ: {
+               r = READ_ONCE(kvm->arch.default_tsc_khz);
+               goto out;
+       }
        case KVM_MEMORY_ENCRYPT_OP: {
                r = -ENOTTY;
                if (!kvm_x86_ops.mem_enc_ioctl)
@@@ -6584,7 -6613,7 +6612,7 @@@ static void kvm_init_msr_list(void
        u32 dummy[2];
        unsigned i;
  
 -      BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
 +      BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
                         "Please update the fixed PMCs in msrs_to_saved_all[]");
  
        perf_get_x86_pmu_capability(&x86_pmu);
@@@ -8789,22 -8818,22 +8817,22 @@@ static int kvmclock_cpu_online(unsigne
  
  static void kvm_timer_init(void)
  {
-       max_tsc_khz = tsc_khz;
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- #ifdef CONFIG_CPU_FREQ
-               struct cpufreq_policy *policy;
-               int cpu;
-               cpu = get_cpu();
-               policy = cpufreq_cpu_get(cpu);
-               if (policy) {
-                       if (policy->cpuinfo.max_freq)
-                               max_tsc_khz = policy->cpuinfo.max_freq;
-                       cpufreq_cpu_put(policy);
+               max_tsc_khz = tsc_khz;
+               if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+                       struct cpufreq_policy *policy;
+                       int cpu;
+                       cpu = get_cpu();
+                       policy = cpufreq_cpu_get(cpu);
+                       if (policy) {
+                               if (policy->cpuinfo.max_freq)
+                                       max_tsc_khz = policy->cpuinfo.max_freq;
+                               cpufreq_cpu_put(policy);
+                       }
+                       put_cpu();
                }
-               put_cpu();
- #endif
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
        }
@@@ -8925,7 -8954,7 +8953,7 @@@ int kvm_arch_init(void *opaque
        }
        kvm_nr_uret_msrs = 0;
  
 -      r = kvm_mmu_module_init();
 +      r = kvm_mmu_vendor_module_init();
        if (r)
                goto out_free_percpu;
  
        }
  
        if (pi_inject_timer == -1)
 -              pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
 +              pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
  #ifdef CONFIG_X86_64
        pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
  
@@@ -8973,7 -9002,7 +9001,7 @@@ void kvm_arch_exit(void
        cancel_work_sync(&pvclock_gtod_work);
  #endif
        kvm_x86_ops.hardware_enable = NULL;
 -      kvm_mmu_module_exit();
 +      kvm_mmu_vendor_module_exit();
        free_percpu(user_return_msrs);
        kmem_cache_destroy(x86_emulator_cache);
  #ifdef CONFIG_KVM_XEN
@@@ -9089,6 -9118,14 +9117,14 @@@ bool kvm_apicv_activated(struct kvm *kv
  }
  EXPORT_SYMBOL_GPL(kvm_apicv_activated);
  
+ bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+ {
+       ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+       ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+       return (vm_reasons | vcpu_reasons) == 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
  
  static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
                                       enum kvm_apicv_inhibit reason, bool set)
@@@ -9266,6 -9303,17 +9302,17 @@@ static int emulator_fix_hypercall(struc
        char instruction[3];
        unsigned long rip = kvm_rip_read(vcpu);
  
+       /*
+        * If the quirk is disabled, synthesize a #UD and let the guest pick up
+        * the pieces.
+        */
+       if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+               ctxt->exception.error_code_valid = false;
+               ctxt->exception.vector = UD_VECTOR;
+               ctxt->have_exception = true;
+               return X86EMUL_PROPAGATE_FAULT;
+       }
        static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
  
        return emulator_write_emulated(ctxt, rip, instruction, 3,
@@@ -9763,7 -9811,8 +9810,8 @@@ void kvm_vcpu_update_apicv(struct kvm_v
  
        down_read(&vcpu->kvm->arch.apicv_update_lock);
  
-       activate = kvm_apicv_activated(vcpu->kvm);
+       activate = kvm_vcpu_apicv_activated(vcpu);
        if (vcpu->arch.apicv_active == activate)
                goto out;
  
@@@ -10164,7 -10213,7 +10212,7 @@@ static int vcpu_enter_guest(struct kvm_
                 * per-VM state, and responding vCPUs must wait for the update
                 * to complete before servicing KVM_REQ_APICV_UPDATE.
                 */
-               WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+               WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
  
                exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@@ -10362,6 -10411,9 +10410,9 @@@ static int vcpu_run(struct kvm_vcpu *vc
                        break;
  
                kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+               if (kvm_xen_has_pending_events(vcpu))
+                       kvm_xen_inject_pending_events(vcpu);
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);
  
@@@ -11247,9 -11299,10 +11298,10 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
  
        vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+       kvm_xen_init_vcpu(vcpu);
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
-       kvm_set_tsc_khz(vcpu, max_tsc_khz);
+       kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
        kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
@@@ -11304,6 -11357,7 +11356,7 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
  
+       kvm_xen_destroy_vcpu(vcpu);
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
        kfree(vcpu->arch.mce_banks);
@@@ -11696,6 -11750,7 +11749,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        pvclock_update_vm_gtod_copy(kvm);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  
+       kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
        kvm->arch.guest_can_read_msr_platform_info = true;
        kvm->arch.enable_pmu = enable_pmu;
  
@@@ -12173,6 -12228,9 +12227,9 @@@ static inline bool kvm_vcpu_has_events(
            kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
                return true;
  
+       if (kvm_xen_has_pending_events(vcpu))
+               return true;
        return false;
  }
  
@@@ -12985,19 -13043,3 +13042,19 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexi
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
 +
 +static int __init kvm_x86_init(void)
 +{
 +      kvm_mmu_x86_module_init();
 +      return 0;
 +}
 +module_init(kvm_x86_init);
 +
 +static void __exit kvm_x86_exit(void)
 +{
 +      /*
 +       * If module_init() is implemented, module_exit() must also be
 +       * implemented to allow module unload.
 +       */
 +}
 +module_exit(kvm_x86_exit);
@@@ -3,7 -3,6 +3,7 @@@
  /aarch64/debug-exceptions
  /aarch64/get-reg-list
  /aarch64/psci_cpu_on_test
 +/aarch64/vcpu_width_config
  /aarch64/vgic_init
  /aarch64/vgic_irq
  /s390x/memop
  /x86_64/debug_regs
  /x86_64/evmcs_test
  /x86_64/emulator_error_test
+ /x86_64/fix_hypercall_test
  /x86_64/get_msr_index_features
  /x86_64/kvm_clock_test
  /x86_64/kvm_pv_test
  /x86_64/hyperv_clock
  /x86_64/hyperv_cpuid
  /x86_64/hyperv_features
 +/x86_64/hyperv_svm_test
  /x86_64/mmio_warning_test
  /x86_64/mmu_role_test
  /x86_64/platform_info_test
@@@ -34,7 -33,6 +35,7 @@@
  /x86_64/state_test
  /x86_64/svm_vmcall_test
  /x86_64/svm_int_ctl_test
 +/x86_64/tsc_scaling_sync
  /x86_64/sync_regs_test
  /x86_64/tsc_msrs_test
  /x86_64/userspace_io_test
@@@ -48,6 -48,7 +48,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpu
  TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
  TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
  TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
+ TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
  TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
  TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
  TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@@ -65,6 -66,7 +66,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/state_t
  TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
  TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
  TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
+ TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
  TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
  TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
  TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
@@@ -106,7 -108,6 +108,7 @@@ TEST_GEN_PROGS_aarch64 += aarch64/arch_
  TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
  TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
  TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
 +TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config
  TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
  TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
  TEST_GEN_PROGS_aarch64 += demand_paging_test
@@@ -154,7 -155,7 +156,7 @@@ endi
  CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
        -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
        -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
 -      -I$(<D) -Iinclude/$(UNAME_M) -I.. $(EXTRA_CFLAGS)
 +      -I$(<D) -Iinclude/$(UNAME_M) -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
  
  no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
          $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie)