KVM: nVMX: Use semi-colon instead of comma for exit-handlers initialization
[linux-2.6-microblaze.git] arch/x86/kvm/vmx/nested.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/frame.h>
4 #include <linux/percpu.h>
5
6 #include <asm/debugreg.h>
7 #include <asm/mmu_context.h>
8
9 #include "cpuid.h"
10 #include "hyperv.h"
11 #include "mmu.h"
12 #include "nested.h"
13 #include "pmu.h"
14 #include "trace.h"
15 #include "x86.h"
16
17 static bool __read_mostly enable_shadow_vmcs = true;
18 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
19
20 static bool __read_mostly nested_early_check = false;
21 module_param(nested_early_check, bool, S_IRUGO);
22
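/*
 * Wrap a VM-entry consistency check: if the check fails, trace the failing
 * expression via the kvm_nested_vmenter_failed tracepoint so the reason for
 * the nested VM-entry failure is visible, and evaluate to the failure status.
 */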
23 #define CC(consistency_check)                                           \
24 ({                                                                      \
25         bool failed = (consistency_check);                              \
26         if (failed)                                                     \
27                 trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
28         failed;                                                         \
29 })
30
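/*
 * Write an MSR on behalf of the guest and warn (ratelimited) if the write is
 * rejected; evaluates to true on failure.
 */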
31 #define SET_MSR_OR_WARN(vcpu, idx, data)                                \
32 ({                                                                      \
33         bool failed = kvm_set_msr(vcpu, idx, data);                     \
34         if (failed)                                                     \
35                 pr_warn_ratelimited(                                    \
36                                 "%s cannot write MSR (0x%x, 0x%llx)\n", \
37                                 __func__, idx, data);                   \
38         failed;                                                         \
39 })
40
41 /*
42  * Hyper-V requires all of these, so mark them as supported even though
43  * they are just treated the same as all-context.
44  */
45 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
46         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
47         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
48         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
49         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
50
51 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
52
53 enum {
54         VMX_VMREAD_BITMAP,
55         VMX_VMWRITE_BITMAP,
56         VMX_BITMAP_NR
57 };
58 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
59
60 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
61 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
62
63 struct shadow_vmcs_field {
64         u16     encoding;
65         u16     offset;
66 };
67 static struct shadow_vmcs_field shadow_read_only_fields[] = {
68 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
69 #include "vmcs_shadow_fields.h"
70 };
71 static int max_shadow_read_only_fields =
72         ARRAY_SIZE(shadow_read_only_fields);
73
74 static struct shadow_vmcs_field shadow_read_write_fields[] = {
75 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
76 #include "vmcs_shadow_fields.h"
77 };
78 static int max_shadow_read_write_fields =
79         ARRAY_SIZE(shadow_read_write_fields);
80
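/*
 * Set all bits in the VMREAD/VMWRITE bitmaps so that every field is
 * intercepted by default, then clear the bits for the fields that may be
 * shadowed and compact the shadow field tables, dropping entries the host
 * cannot shadow (e.g. PML index, preemption timer value or guest interrupt
 * status when the corresponding feature is absent).
 */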
81 static void init_vmcs_shadow_fields(void)
82 {
83         int i, j;
84
85         memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
86         memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
87
88         for (i = j = 0; i < max_shadow_read_only_fields; i++) {
89                 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
90                 u16 field = entry.encoding;
91
92                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
93                     (i + 1 == max_shadow_read_only_fields ||
94                      shadow_read_only_fields[i + 1].encoding != field + 1))
95                         pr_err("Missing field from shadow_read_only_field %x\n",
96                                field + 1);
97
98                 clear_bit(field, vmx_vmread_bitmap);
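                /*
                 * Odd encodings are the "high" halves of 64-bit fields.  On
                 * 64-bit hosts the full field is handled via the even
                 * encoding, so skip the entry; on 32-bit hosts keep it and
                 * point it at the upper 32 bits of the vmcs12 field.
                 */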
99                 if (field & 1)
100 #ifdef CONFIG_X86_64
101                         continue;
102 #else
103                         entry.offset += sizeof(u32);
104 #endif
105                 shadow_read_only_fields[j++] = entry;
106         }
107         max_shadow_read_only_fields = j;
108
109         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
110                 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
111                 u16 field = entry.encoding;
112
113                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
114                     (i + 1 == max_shadow_read_write_fields ||
115                      shadow_read_write_fields[i + 1].encoding != field + 1))
116                         pr_err("Missing field from shadow_read_write_field %x\n",
117                                field + 1);
118
119                 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
120                           field <= GUEST_TR_AR_BYTES,
121                           "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
122
123                 /*
124                  * PML and the preemption timer can be emulated, but the
125                  * processor cannot vmwrite to fields that don't exist
126                  * on bare metal.
127                  */
128                 switch (field) {
129                 case GUEST_PML_INDEX:
130                         if (!cpu_has_vmx_pml())
131                                 continue;
132                         break;
133                 case VMX_PREEMPTION_TIMER_VALUE:
134                         if (!cpu_has_vmx_preemption_timer())
135                                 continue;
136                         break;
137                 case GUEST_INTR_STATUS:
138                         if (!cpu_has_vmx_apicv())
139                                 continue;
140                         break;
141                 default:
142                         break;
143                 }
144
145                 clear_bit(field, vmx_vmwrite_bitmap);
146                 clear_bit(field, vmx_vmread_bitmap);
147                 if (field & 1)
148 #ifdef CONFIG_X86_64
149                         continue;
150 #else
151                         entry.offset += sizeof(u32);
152 #endif
153                 shadow_read_write_fields[j++] = entry;
154         }
155         max_shadow_read_write_fields = j;
156 }
157
158 /*
159  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
160  * set the success or error code of an emulated VMX instruction (as specified
161  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
162  * instruction.
163  */
164 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
165 {
166         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
167                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
168                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
169         return kvm_skip_emulated_instruction(vcpu);
170 }
171
172 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
173 {
174         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
175                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
176                             X86_EFLAGS_SF | X86_EFLAGS_OF))
177                         | X86_EFLAGS_CF);
178         return kvm_skip_emulated_instruction(vcpu);
179 }
180
181 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
182                                 u32 vm_instruction_error)
183 {
184         struct vcpu_vmx *vmx = to_vmx(vcpu);
185
186         /*
187          * failValid writes the error number to the current VMCS, which
188          * can't be done if there isn't a current VMCS.
189          */
190         if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
191                 return nested_vmx_failInvalid(vcpu);
192
193         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
194                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
195                             X86_EFLAGS_SF | X86_EFLAGS_OF))
196                         | X86_EFLAGS_ZF);
197         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
198         /*
199          * We don't need to force a shadow sync because
200          * VM_INSTRUCTION_ERROR is not shadowed
201          */
202         return kvm_skip_emulated_instruction(vcpu);
203 }
204
205 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
206 {
207         /* TODO: don't simply reset the guest here. */
208         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
209         pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
210 }
211
212 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
213 {
214         return fixed_bits_valid(control, low, high);
215 }
216
217 static inline u64 vmx_control_msr(u32 low, u32 high)
218 {
219         return low | ((u64)high << 32);
220 }
221
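/*
 * Stop shadow-VMCS processing for this vCPU: clear the secondary execution
 * control, point the VMCS link pointer at -1ull (no shadow VMCS) and drop
 * any pending vmcs12-to-shadow sync.
 */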
222 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
223 {
224         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
225         vmcs_write64(VMCS_LINK_POINTER, -1ull);
226         vmx->nested.need_vmcs12_to_shadow_sync = false;
227 }
228
229 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
230 {
231         struct vcpu_vmx *vmx = to_vmx(vcpu);
232
233         if (!vmx->nested.hv_evmcs)
234                 return;
235
236         kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
237         vmx->nested.hv_evmcs_vmptr = -1ull;
238         vmx->nested.hv_evmcs = NULL;
239 }
240
241 /*
242  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
243  * just stops using VMX.
244  */
245 static void free_nested(struct kvm_vcpu *vcpu)
246 {
247         struct vcpu_vmx *vmx = to_vmx(vcpu);
248
249         if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
250                 return;
251
252         kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
253
254         vmx->nested.vmxon = false;
255         vmx->nested.smm.vmxon = false;
256         free_vpid(vmx->nested.vpid02);
257         vmx->nested.posted_intr_nv = -1;
258         vmx->nested.current_vmptr = -1ull;
259         if (enable_shadow_vmcs) {
260                 vmx_disable_shadow_vmcs(vmx);
261                 vmcs_clear(vmx->vmcs01.shadow_vmcs);
262                 free_vmcs(vmx->vmcs01.shadow_vmcs);
263                 vmx->vmcs01.shadow_vmcs = NULL;
264         }
265         kfree(vmx->nested.cached_vmcs12);
266         vmx->nested.cached_vmcs12 = NULL;
267         kfree(vmx->nested.cached_shadow_vmcs12);
268         vmx->nested.cached_shadow_vmcs12 = NULL;
269         /* Unpin physical memory we referred to in the vmcs02 */
270         if (vmx->nested.apic_access_page) {
271                 kvm_release_page_dirty(vmx->nested.apic_access_page);
272                 vmx->nested.apic_access_page = NULL;
273         }
274         kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
275         kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
276         vmx->nested.pi_desc = NULL;
277
278         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
279
280         nested_release_evmcs(vcpu);
281
282         free_loaded_vmcs(&vmx->nested.vmcs02);
283 }
284
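/*
 * When switching loaded VMCSs with guest state already loaded, copy the
 * cached host segment state (FS/GS bases and selectors, LDT and, on 64-bit,
 * DS/ES) from the previous VMCS to the new one so the two stay consistent.
 */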
285 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
286                                      struct loaded_vmcs *prev)
287 {
288         struct vmcs_host_state *dest, *src;
289
290         if (unlikely(!vmx->guest_state_loaded))
291                 return;
292
293         src = &prev->host_state;
294         dest = &vmx->loaded_vmcs->host_state;
295
296         vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
297         dest->ldt_sel = src->ldt_sel;
298 #ifdef CONFIG_X86_64
299         dest->ds_sel = src->ds_sel;
300         dest->es_sel = src->es_sel;
301 #endif
302 }
303
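/*
 * Make @vmcs the vCPU's current loaded VMCS: load it on this CPU and carry
 * over the cached host state from the previously loaded VMCS.
 */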
304 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
305 {
306         struct vcpu_vmx *vmx = to_vmx(vcpu);
307         struct loaded_vmcs *prev;
308         int cpu;
309
310         if (vmx->loaded_vmcs == vmcs)
311                 return;
312
313         cpu = get_cpu();
314         prev = vmx->loaded_vmcs;
315         vmx->loaded_vmcs = vmcs;
316         vmx_vcpu_load_vmcs(vcpu, cpu);
317         vmx_sync_vmcs_host_state(vmx, prev);
318         put_cpu();
319
320         vmx_segment_cache_clear(vmx);
321 }
322
323 /*
324  * Ensure that the current vmcs of the logical processor is the
325  * vmcs01 of the vcpu before calling free_nested().
326  */
327 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
328 {
329         vcpu_load(vcpu);
330         vmx_leave_nested(vcpu);
331         vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
332         free_nested(vcpu);
333         vcpu_put(vcpu);
334 }
335
336 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
337                 struct x86_exception *fault)
338 {
339         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
340         struct vcpu_vmx *vmx = to_vmx(vcpu);
341         u32 exit_reason;
342         unsigned long exit_qualification = vcpu->arch.exit_qualification;
343
344         if (vmx->nested.pml_full) {
345                 exit_reason = EXIT_REASON_PML_FULL;
346                 vmx->nested.pml_full = false;
347                 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
348         } else if (fault->error_code & PFERR_RSVD_MASK)
349                 exit_reason = EXIT_REASON_EPT_MISCONFIG;
350         else
351                 exit_reason = EXIT_REASON_EPT_VIOLATION;
352
353         nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
354         vmcs12->guest_physical_address = fault->address;
355 }
356
357 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
358 {
359         WARN_ON(mmu_is_nested(vcpu));
360
361         vcpu->arch.mmu = &vcpu->arch.guest_mmu;
362         kvm_init_shadow_ept_mmu(vcpu,
363                         to_vmx(vcpu)->nested.msrs.ept_caps &
364                         VMX_EPT_EXECUTE_ONLY_BIT,
365                         nested_ept_ad_enabled(vcpu),
366                         nested_ept_get_cr3(vcpu));
367         vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
368         vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
369         vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
370         vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
371
372         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
373 }
374
375 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
376 {
377         vcpu->arch.mmu = &vcpu->arch.root_mmu;
378         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
379 }
380
381 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
382                                             u16 error_code)
383 {
384         bool inequality, bit;
385
386         bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
387         inequality =
388                 (error_code & vmcs12->page_fault_error_code_mask) !=
389                  vmcs12->page_fault_error_code_match;
390         return inequality ^ bit;
391 }
392
393
394 /*
395  * KVM wants to inject page faults that it received to the guest. This function
396  * checks whether, in a nested guest, they need to be injected into L1 or L2.
397  */
398 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
399 {
400         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
401         unsigned int nr = vcpu->arch.exception.nr;
402         bool has_payload = vcpu->arch.exception.has_payload;
403         unsigned long payload = vcpu->arch.exception.payload;
404
405         if (nr == PF_VECTOR) {
406                 if (vcpu->arch.exception.nested_apf) {
407                         *exit_qual = vcpu->arch.apf.nested_apf_token;
408                         return 1;
409                 }
410                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
411                                                     vcpu->arch.exception.error_code)) {
412                         *exit_qual = has_payload ? payload : vcpu->arch.cr2;
413                         return 1;
414                 }
415         } else if (vmcs12->exception_bitmap & (1u << nr)) {
416                 if (nr == DB_VECTOR) {
417                         if (!has_payload) {
418                                 payload = vcpu->arch.dr6;
419                                 payload &= ~(DR6_FIXED_1 | DR6_BT);
420                                 payload ^= DR6_RTM;
421                         }
422                         *exit_qual = payload;
423                 } else
424                         *exit_qual = 0;
425                 return 1;
426         }
427
428         return 0;
429 }
430
431
432 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
433                 struct x86_exception *fault)
434 {
435         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
436
437         WARN_ON(!is_guest_mode(vcpu));
438
439         if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
440                 !to_vmx(vcpu)->nested.nested_run_pending) {
441                 vmcs12->vm_exit_intr_error_code = fault->error_code;
442                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
443                                   PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
444                                   INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
445                                   fault->address);
446         } else {
447                 kvm_inject_page_fault(vcpu, fault);
448         }
449 }
450
451 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
452 {
453         return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
454 }
455
456 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
457                                                struct vmcs12 *vmcs12)
458 {
459         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
460                 return 0;
461
462         if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
463             CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
464                 return -EINVAL;
465
466         return 0;
467 }
468
469 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
470                                                 struct vmcs12 *vmcs12)
471 {
472         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
473                 return 0;
474
475         if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
476                 return -EINVAL;
477
478         return 0;
479 }
480
481 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
482                                                 struct vmcs12 *vmcs12)
483 {
484         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
485                 return 0;
486
487         if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
488                 return -EINVAL;
489
490         return 0;
491 }
492
493 /*
494  * Check if a write to the given MSR is intercepted in the L01 MSR bitmap.
495  */
496 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
497 {
498         unsigned long *msr_bitmap;
499         int f = sizeof(unsigned long);
500
501         if (!cpu_has_vmx_msr_bitmap())
502                 return true;
503
504         msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
505
506         if (msr <= 0x1fff) {
507                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
508         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
509                 msr &= 0x1fff;
510                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
511         }
512
513         return true;
514 }
515
516 /*
517  * If an MSR is allowed by L0, we should check whether it is allowed by L1.
518  * The corresponding bit will be cleared unless both L0 and L1 allow it.
519  */
520 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
521                                                unsigned long *msr_bitmap_nested,
522                                                u32 msr, int type)
523 {
524         int f = sizeof(unsigned long);
525
526         /*
527          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
528          * have the write-low and read-high bitmap offsets the wrong way round.
529          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
530          */
531         if (msr <= 0x1fff) {
532                 if (type & MSR_TYPE_R &&
533                    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
534                         /* read-low */
535                         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
536
537                 if (type & MSR_TYPE_W &&
538                    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
539                         /* write-low */
540                         __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
541
542         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
543                 msr &= 0x1fff;
544                 if (type & MSR_TYPE_R &&
545                    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
546                         /* read-high */
547                         __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
548
549                 if (type & MSR_TYPE_W &&
550                    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
551                         /* write-high */
552                         __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
553
554         }
555 }
556
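/* Set both the read and write intercept bits for the entire x2APIC MSR range. */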
557 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
558         int msr;
559
560         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
561                 unsigned word = msr / BITS_PER_LONG;
562
563                 msr_bitmap[word] = ~0;
564                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
565         }
566 }
567
568 /*
569  * Merge L0's and L1's MSR bitmaps; return false to indicate that
570  * we do not use the hardware MSR bitmap.
571  */
572 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
573                                                  struct vmcs12 *vmcs12)
574 {
575         int msr;
576         unsigned long *msr_bitmap_l1;
577         unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
578         struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
579
580         /* Nothing to do if the MSR bitmap is not in use.  */
581         if (!cpu_has_vmx_msr_bitmap() ||
582             !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
583                 return false;
584
585         if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
586                 return false;
587
588         msr_bitmap_l1 = (unsigned long *)map->hva;
589
590         /*
591          * To keep the control flow simple, pay eight 8-byte writes (sixteen
592          * 4-byte writes on 32-bit systems) up front to enable intercepts for
593          * the x2APIC MSR range and selectively disable them below.
594          */
595         enable_x2apic_msr_intercepts(msr_bitmap_l0);
596
597         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
598                 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
599                         /*
600                          * L0 need not intercept reads for MSRs between 0x800
601                          * and 0x8ff; it just lets the processor take the value
602                          * from the virtual-APIC page. So take those 256 bits
603                          * directly from the L1 bitmap.
604                          */
605                         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
606                                 unsigned word = msr / BITS_PER_LONG;
607
608                                 msr_bitmap_l0[word] = msr_bitmap_l1[word];
609                         }
610                 }
611
612                 nested_vmx_disable_intercept_for_msr(
613                         msr_bitmap_l1, msr_bitmap_l0,
614                         X2APIC_MSR(APIC_TASKPRI),
615                         MSR_TYPE_R | MSR_TYPE_W);
616
617                 if (nested_cpu_has_vid(vmcs12)) {
618                         nested_vmx_disable_intercept_for_msr(
619                                 msr_bitmap_l1, msr_bitmap_l0,
620                                 X2APIC_MSR(APIC_EOI),
621                                 MSR_TYPE_W);
622                         nested_vmx_disable_intercept_for_msr(
623                                 msr_bitmap_l1, msr_bitmap_l0,
624                                 X2APIC_MSR(APIC_SELF_IPI),
625                                 MSR_TYPE_W);
626                 }
627         }
628
629         /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
630         nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
631                                              MSR_FS_BASE, MSR_TYPE_RW);
632
633         nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
634                                              MSR_GS_BASE, MSR_TYPE_RW);
635
636         nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
637                                              MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
638
639         /*
640          * Checking the L0->L1 bitmap is intended to verify two things:
641          *
642          * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
643          *    ensures that we do not accidentally generate an L02 MSR bitmap
644          *    from the L12 MSR bitmap that is too permissive.
645          * 2. That L1 or L2s have actually used the MSR. This avoids
646          *    unnecessary merging of the bitmap if the MSR is unused. This
647          *    works properly because we only update the L01 MSR bitmap lazily.
648          *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
649          *    updated to reflect this when L1 (or its L2s) actually write to
650          *    the MSR.
651          */
652         if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
653                 nested_vmx_disable_intercept_for_msr(
654                                         msr_bitmap_l1, msr_bitmap_l0,
655                                         MSR_IA32_SPEC_CTRL,
656                                         MSR_TYPE_R | MSR_TYPE_W);
657
658         if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
659                 nested_vmx_disable_intercept_for_msr(
660                                         msr_bitmap_l1, msr_bitmap_l0,
661                                         MSR_IA32_PRED_CMD,
662                                         MSR_TYPE_W);
663
664         kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
665
666         return true;
667 }
668
669 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
670                                        struct vmcs12 *vmcs12)
671 {
672         struct kvm_host_map map;
673         struct vmcs12 *shadow;
674
675         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
676             vmcs12->vmcs_link_pointer == -1ull)
677                 return;
678
679         shadow = get_shadow_vmcs12(vcpu);
680
681         if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
682                 return;
683
684         memcpy(shadow, map.hva, VMCS12_SIZE);
685         kvm_vcpu_unmap(vcpu, &map, false);
686 }
687
688 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
689                                               struct vmcs12 *vmcs12)
690 {
691         struct vcpu_vmx *vmx = to_vmx(vcpu);
692
693         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
694             vmcs12->vmcs_link_pointer == -1ull)
695                 return;
696
697         kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
698                         get_shadow_vmcs12(vcpu), VMCS12_SIZE);
699 }
700
701 /*
702  * In nested virtualization, check if L1 has set
703  * VM_EXIT_ACK_INTR_ON_EXIT
704  */
705 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
706 {
707         return get_vmcs12(vcpu)->vm_exit_controls &
708                 VM_EXIT_ACK_INTR_ON_EXIT;
709 }
710
711 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
712 {
713         return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
714 }
715
716 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
717                                           struct vmcs12 *vmcs12)
718 {
719         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
720             CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
721                 return -EINVAL;
722         else
723                 return 0;
724 }
725
726 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
727                                            struct vmcs12 *vmcs12)
728 {
729         if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
730             !nested_cpu_has_apic_reg_virt(vmcs12) &&
731             !nested_cpu_has_vid(vmcs12) &&
732             !nested_cpu_has_posted_intr(vmcs12))
733                 return 0;
734
735         /*
736          * If virtualize x2apic mode is enabled,
737          * virtualize apic access must be disabled.
738          */
739         if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
740                nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
741                 return -EINVAL;
742
743         /*
744          * If virtual interrupt delivery is enabled,
745          * we must exit on external interrupts.
746          */
747         if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
748                 return -EINVAL;
749
750          * bits 15:8 should be zero in posted_intr_nv;
751          * the descriptor address has already been checked
752          * the descriptor address has been already checked
753          * in nested_get_vmcs12_pages.
754          *
755          * bits 5:0 of posted_intr_desc_addr should be zero.
756          */
757         if (nested_cpu_has_posted_intr(vmcs12) &&
758            (CC(!nested_cpu_has_vid(vmcs12)) ||
759             CC(!nested_exit_intr_ack_set(vcpu)) ||
760             CC((vmcs12->posted_intr_nv & 0xff00)) ||
761             CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
762             CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
763                 return -EINVAL;
764
765         /* tpr shadow is needed by all apicv features. */
766         if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
767                 return -EINVAL;
768
769         return 0;
770 }
771
772 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
773                                        u32 count, u64 addr)
774 {
775         int maxphyaddr;
776
777         if (count == 0)
778                 return 0;
779         maxphyaddr = cpuid_maxphyaddr(vcpu);
780         if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
781             (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
782                 return -EINVAL;
783
784         return 0;
785 }
786
787 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
788                                                      struct vmcs12 *vmcs12)
789 {
790         if (CC(nested_vmx_check_msr_switch(vcpu,
791                                            vmcs12->vm_exit_msr_load_count,
792                                            vmcs12->vm_exit_msr_load_addr)) ||
793             CC(nested_vmx_check_msr_switch(vcpu,
794                                            vmcs12->vm_exit_msr_store_count,
795                                            vmcs12->vm_exit_msr_store_addr)))
796                 return -EINVAL;
797
798         return 0;
799 }
800
801 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
802                                                       struct vmcs12 *vmcs12)
803 {
804         if (CC(nested_vmx_check_msr_switch(vcpu,
805                                            vmcs12->vm_entry_msr_load_count,
806                                            vmcs12->vm_entry_msr_load_addr)))
807                 return -EINVAL;
808
809         return 0;
810 }
811
812 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
813                                          struct vmcs12 *vmcs12)
814 {
815         if (!nested_cpu_has_pml(vmcs12))
816                 return 0;
817
818         if (CC(!nested_cpu_has_ept(vmcs12)) ||
819             CC(!page_address_valid(vcpu, vmcs12->pml_address)))
820                 return -EINVAL;
821
822         return 0;
823 }
824
825 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
826                                                         struct vmcs12 *vmcs12)
827 {
828         if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
829                !nested_cpu_has_ept(vmcs12)))
830                 return -EINVAL;
831         return 0;
832 }
833
834 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
835                                                          struct vmcs12 *vmcs12)
836 {
837         if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
838                !nested_cpu_has_ept(vmcs12)))
839                 return -EINVAL;
840         return 0;
841 }
842
843 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
844                                                  struct vmcs12 *vmcs12)
845 {
846         if (!nested_cpu_has_shadow_vmcs(vmcs12))
847                 return 0;
848
849         if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
850             CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
851                 return -EINVAL;
852
853         return 0;
854 }
855
856 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
857                                        struct vmx_msr_entry *e)
858 {
859         /* x2APIC MSR accesses are not allowed */
860         if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
861                 return -EINVAL;
862         if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
863             CC(e->index == MSR_IA32_UCODE_REV))
864                 return -EINVAL;
865         if (CC(e->reserved != 0))
866                 return -EINVAL;
867         return 0;
868 }
869
870 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
871                                      struct vmx_msr_entry *e)
872 {
873         if (CC(e->index == MSR_FS_BASE) ||
874             CC(e->index == MSR_GS_BASE) ||
875             CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
876             nested_vmx_msr_check_common(vcpu, e))
877                 return -EINVAL;
878         return 0;
879 }
880
881 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
882                                       struct vmx_msr_entry *e)
883 {
884         if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
885             nested_vmx_msr_check_common(vcpu, e))
886                 return -EINVAL;
887         return 0;
888 }
889
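/*
 * Maximum number of MSRs that L1 may place in an atomic switch (load/store)
 * list, derived from the MSR-list size advertised in IA32_VMX_MISC.
 */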
890 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
891 {
892         struct vcpu_vmx *vmx = to_vmx(vcpu);
893         u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
894                                        vmx->nested.msrs.misc_high);
895
896         return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
897 }
898
899 /*
900  * Load the guest's/host's MSRs at nested entry/exit.
901  * Return 0 for success, or the entry index for failure.
902  *
903  * One of the failure modes for MSR load/store is when a list exceeds the
904  * virtual hardware's capacity. To maintain compatibility with hardware as much
905  * as possible, process all valid entries before failing rather than prechecking
906  * for a capacity violation.
907  */
908 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
909 {
910         u32 i;
911         struct vmx_msr_entry e;
912         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
913
914         for (i = 0; i < count; i++) {
915                 if (unlikely(i >= max_msr_list_size))
916                         goto fail;
917
918                 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
919                                         &e, sizeof(e))) {
920                         pr_debug_ratelimited(
921                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
922                                 __func__, i, gpa + i * sizeof(e));
923                         goto fail;
924                 }
925                 if (nested_vmx_load_msr_check(vcpu, &e)) {
926                         pr_debug_ratelimited(
927                                 "%s check failed (%u, 0x%x, 0x%x)\n",
928                                 __func__, i, e.index, e.reserved);
929                         goto fail;
930                 }
931                 if (kvm_set_msr(vcpu, e.index, e.value)) {
932                         pr_debug_ratelimited(
933                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
934                                 __func__, i, e.index, e.value);
935                         goto fail;
936                 }
937         }
938         return 0;
939 fail:
940         return i + 1;
941 }
942
943 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
944                                             u32 msr_index,
945                                             u64 *data)
946 {
947         struct vcpu_vmx *vmx = to_vmx(vcpu);
948
949         /*
950          * If the L0 hypervisor stored a more accurate value for the TSC that
951          * does not include the time taken for emulation of the L2->L1
952          * VM-exit in L0, use the more accurate value.
953          */
954         if (msr_index == MSR_IA32_TSC) {
955                 int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
956                                                MSR_IA32_TSC);
957
958                 if (index >= 0) {
959                         u64 val = vmx->msr_autostore.guest.val[index].value;
960
961                         *data = kvm_read_l1_tsc(vcpu, val);
962                         return true;
963                 }
964         }
965
966         if (kvm_get_msr(vcpu, msr_index, data)) {
967                 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
968                         msr_index);
969                 return false;
970         }
971         return true;
972 }
973
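/*
 * Read the index and reserved fields (the first two u32s) of an MSR entry
 * from the guest list at @gpa and validate them for the VM-exit MSR-store
 * path.
 */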
974 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
975                                      struct vmx_msr_entry *e)
976 {
977         if (kvm_vcpu_read_guest(vcpu,
978                                 gpa + i * sizeof(*e),
979                                 e, 2 * sizeof(u32))) {
980                 pr_debug_ratelimited(
981                         "%s cannot read MSR entry (%u, 0x%08llx)\n",
982                         __func__, i, gpa + i * sizeof(*e));
983                 return false;
984         }
985         if (nested_vmx_store_msr_check(vcpu, e)) {
986                 pr_debug_ratelimited(
987                         "%s check failed (%u, 0x%x, 0x%x)\n",
988                         __func__, i, e->index, e->reserved);
989                 return false;
990         }
991         return true;
992 }
993
994 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
995 {
996         u64 data;
997         u32 i;
998         struct vmx_msr_entry e;
999         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1000
1001         for (i = 0; i < count; i++) {
1002                 if (unlikely(i >= max_msr_list_size))
1003                         return -EINVAL;
1004
1005                 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1006                         return -EINVAL;
1007
1008                 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1009                         return -EINVAL;
1010
1011                 if (kvm_vcpu_write_guest(vcpu,
1012                                          gpa + i * sizeof(e) +
1013                                              offsetof(struct vmx_msr_entry, value),
1014                                          &data, sizeof(data))) {
1015                         pr_debug_ratelimited(
1016                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1017                                 __func__, i, e.index, data);
1018                         return -EINVAL;
1019                 }
1020         }
1021         return 0;
1022 }
1023
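/*
 * Return true if the vmcs12 VM-exit MSR-store list contains an entry for
 * @msr_index.
 */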
1024 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1025 {
1026         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1027         u32 count = vmcs12->vm_exit_msr_store_count;
1028         u64 gpa = vmcs12->vm_exit_msr_store_addr;
1029         struct vmx_msr_entry e;
1030         u32 i;
1031
1032         for (i = 0; i < count; i++) {
1033                 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1034                         return false;
1035
1036                 if (e.index == msr_index)
1037                         return true;
1038         }
1039         return false;
1040 }
1041
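/*
 * Keep vmx->msr_autostore.guest in sync with the vmcs12 VM-exit MSR-store
 * list: add @msr_index when L1 wants it stored on VM-exit, remove it when
 * L1 no longer does.
 */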
1042 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1043                                            u32 msr_index)
1044 {
1045         struct vcpu_vmx *vmx = to_vmx(vcpu);
1046         struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1047         bool in_vmcs12_store_list;
1048         int msr_autostore_index;
1049         bool in_autostore_list;
1050         int last;
1051
1052         msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
1053         in_autostore_list = msr_autostore_index >= 0;
1054         in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1055
1056         if (in_vmcs12_store_list && !in_autostore_list) {
1057                 if (autostore->nr == NR_LOADSTORE_MSRS) {
1058                         /*
1059                          * Emulated VMEntry does not fail here.  Instead a less
1060                          * accurate value will be returned by
1061                          * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1062                          * instead of reading the value from the vmcs02 VMExit
1063                          * MSR-store area.
1064                          */
1065                         pr_warn_ratelimited(
1066                                 "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
1067                                 msr_index);
1068                         return;
1069                 }
1070                 last = autostore->nr++;
1071                 autostore->val[last].index = msr_index;
1072         } else if (!in_vmcs12_store_list && in_autostore_list) {
1073                 last = --autostore->nr;
1074                 autostore->val[msr_autostore_index] = autostore->val[last];
1075         }
1076 }
1077
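/* Reject CR3 values with bits set above the guest's MAXPHYADDR. */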
1078 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
1079 {
1080         unsigned long invalid_mask;
1081
1082         invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
1083         return (val & invalid_mask) == 0;
1084 }
1085
1086 /*
1087  * Load the guest's/host's CR3 at nested entry/exit. nested_ept is true if we are
1088  * emulating VM entry into a guest with EPT enabled.
1089  * Returns 0 on success, -EINVAL on failure. The invalid-state exit qualification
1090  * code is assigned to *entry_failure_code on failure.
1091  */
1092 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
1093                                u32 *entry_failure_code)
1094 {
1095         if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1096                 if (CC(!nested_cr3_valid(vcpu, cr3))) {
1097                         *entry_failure_code = ENTRY_FAIL_DEFAULT;
1098                         return -EINVAL;
1099                 }
1100
1101                 /*
1102                  * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1103                  * must not be dereferenced.
1104                  */
1105                 if (is_pae_paging(vcpu) && !nested_ept) {
1106                         if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1107                                 *entry_failure_code = ENTRY_FAIL_PDPTE;
1108                                 return -EINVAL;
1109                         }
1110                 }
1111         }
1112
1113         if (!nested_ept)
1114                 kvm_mmu_new_cr3(vcpu, cr3, false);
1115
1116         vcpu->arch.cr3 = cr3;
1117         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1118
1119         kvm_init_mmu(vcpu, false);
1120
1121         return 0;
1122 }
1123
1124 /*
1125  * Returns true if KVM is able to configure the CPU to tag TLB entries
1126  * populated by L2 differently than TLB entries populated
1127  * by L1.
1128  *
1129  * If L1 uses EPT, then TLB entries are tagged with different EPTP.
1130  *
1131  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1132  * with different VPID (L1 entries are tagged with vmx->vpid
1133  * while L2 entries are tagged with vmx->nested.vpid02).
1134  */
1135 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1136 {
1137         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1138
1139         return nested_cpu_has_ept(vmcs12) ||
1140                (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1141 }
1142
1143 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
1144 {
1145         struct vcpu_vmx *vmx = to_vmx(vcpu);
1146
1147         return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
1148 }
1149
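/*
 * Return true if, within @mask, every bit set in @subset is also set in
 * @superset.
 */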
1150 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1151 {
1152         superset &= mask;
1153         subset &= mask;
1154
1155         return (superset | subset) == superset;
1156 }
1157
1158 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1159 {
1160         const u64 feature_and_reserved =
1161                 /* feature (except bit 48; see below) */
1162                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1163                 /* reserved */
1164                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1165         u64 vmx_basic = vmx->nested.msrs.basic;
1166
1167         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1168                 return -EINVAL;
1169
1170         /*
1171          * KVM does not emulate a version of VMX that constrains physical
1172          * addresses of VMX structures (e.g. VMCS) to 32 bits.
1173          */
1174         if (data & BIT_ULL(48))
1175                 return -EINVAL;
1176
1177         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1178             vmx_basic_vmcs_revision_id(data))
1179                 return -EINVAL;
1180
1181         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1182                 return -EINVAL;
1183
1184         vmx->nested.msrs.basic = data;
1185         return 0;
1186 }
1187
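/*
 * Restore one of the "true" VMX control MSRs from userspace, verifying that
 * no must-be-1 bits are cleared and no must-be-0 bits are set relative to
 * what KVM supports.
 */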
1188 static int
1189 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1190 {
1191         u64 supported;
1192         u32 *lowp, *highp;
1193
1194         switch (msr_index) {
1195         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1196                 lowp = &vmx->nested.msrs.pinbased_ctls_low;
1197                 highp = &vmx->nested.msrs.pinbased_ctls_high;
1198                 break;
1199         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1200                 lowp = &vmx->nested.msrs.procbased_ctls_low;
1201                 highp = &vmx->nested.msrs.procbased_ctls_high;
1202                 break;
1203         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1204                 lowp = &vmx->nested.msrs.exit_ctls_low;
1205                 highp = &vmx->nested.msrs.exit_ctls_high;
1206                 break;
1207         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1208                 lowp = &vmx->nested.msrs.entry_ctls_low;
1209                 highp = &vmx->nested.msrs.entry_ctls_high;
1210                 break;
1211         case MSR_IA32_VMX_PROCBASED_CTLS2:
1212                 lowp = &vmx->nested.msrs.secondary_ctls_low;
1213                 highp = &vmx->nested.msrs.secondary_ctls_high;
1214                 break;
1215         default:
1216                 BUG();
1217         }
1218
1219         supported = vmx_control_msr(*lowp, *highp);
1220
1221         /* Check must-be-1 bits are still 1. */
1222         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1223                 return -EINVAL;
1224
1225         /* Check must-be-0 bits are still 0. */
1226         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1227                 return -EINVAL;
1228
1229         *lowp = data;
1230         *highp = data >> 32;
1231         return 0;
1232 }
1233
1234 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1235 {
1236         const u64 feature_and_reserved_bits =
1237                 /* feature */
1238                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1239                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1240                 /* reserved */
1241                 GENMASK_ULL(13, 9) | BIT_ULL(31);
1242         u64 vmx_misc;
1243
1244         vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1245                                    vmx->nested.msrs.misc_high);
1246
1247         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1248                 return -EINVAL;
1249
1250         if ((vmx->nested.msrs.pinbased_ctls_high &
1251              PIN_BASED_VMX_PREEMPTION_TIMER) &&
1252             vmx_misc_preemption_timer_rate(data) !=
1253             vmx_misc_preemption_timer_rate(vmx_misc))
1254                 return -EINVAL;
1255
1256         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1257                 return -EINVAL;
1258
1259         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1260                 return -EINVAL;
1261
1262         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1263                 return -EINVAL;
1264
1265         vmx->nested.msrs.misc_low = data;
1266         vmx->nested.msrs.misc_high = data >> 32;
1267
1268         return 0;
1269 }
1270
1271 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1272 {
1273         u64 vmx_ept_vpid_cap;
1274
1275         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1276                                            vmx->nested.msrs.vpid_caps);
1277
1278         /* Every bit is either reserved or a feature bit. */
1279         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1280                 return -EINVAL;
1281
1282         vmx->nested.msrs.ept_caps = data;
1283         vmx->nested.msrs.vpid_caps = data >> 32;
1284         return 0;
1285 }
1286
1287 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1288 {
1289         u64 *msr;
1290
1291         switch (msr_index) {
1292         case MSR_IA32_VMX_CR0_FIXED0:
1293                 msr = &vmx->nested.msrs.cr0_fixed0;
1294                 break;
1295         case MSR_IA32_VMX_CR4_FIXED0:
1296                 msr = &vmx->nested.msrs.cr4_fixed0;
1297                 break;
1298         default:
1299                 BUG();
1300         }
1301
1302         /*
1303          * 1 bits (which indicate the bits that "must be 1" during VMX operation)
1304          * must be 1 in the restored value.
1305          */
1306         if (!is_bitwise_subset(data, *msr, -1ULL))
1307                 return -EINVAL;
1308
1309         *msr = data;
1310         return 0;
1311 }
1312
1313 /*
1314  * Called when userspace is restoring VMX MSRs.
1315  *
1316  * Returns 0 on success, non-0 otherwise.
1317  */
1318 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1319 {
1320         struct vcpu_vmx *vmx = to_vmx(vcpu);
1321
1322         /*
1323          * Don't allow changes to the VMX capability MSRs while the vCPU
1324          * is in VMX operation.
1325          */
1326         if (vmx->nested.vmxon)
1327                 return -EBUSY;
1328
1329         switch (msr_index) {
1330         case MSR_IA32_VMX_BASIC:
1331                 return vmx_restore_vmx_basic(vmx, data);
1332         case MSR_IA32_VMX_PINBASED_CTLS:
1333         case MSR_IA32_VMX_PROCBASED_CTLS:
1334         case MSR_IA32_VMX_EXIT_CTLS:
1335         case MSR_IA32_VMX_ENTRY_CTLS:
1336                 /*
1337                  * The "non-true" VMX capability MSRs are generated from the
1338                  * "true" MSRs, so we do not support restoring them directly.
1339                  *
1340                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1341                  * should restore the "true" MSRs with the must-be-1 bits
1342                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1343                  * DEFAULT SETTINGS".
1344                  */
1345                 return -EINVAL;
1346         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1347         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1348         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1349         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1350         case MSR_IA32_VMX_PROCBASED_CTLS2:
1351                 return vmx_restore_control_msr(vmx, msr_index, data);
1352         case MSR_IA32_VMX_MISC:
1353                 return vmx_restore_vmx_misc(vmx, data);
1354         case MSR_IA32_VMX_CR0_FIXED0:
1355         case MSR_IA32_VMX_CR4_FIXED0:
1356                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1357         case MSR_IA32_VMX_CR0_FIXED1:
1358         case MSR_IA32_VMX_CR4_FIXED1:
1359                 /*
1360                  * These MSRs are generated based on the vCPU's CPUID, so we
1361                  * do not support restoring them directly.
1362                  */
1363                 return -EINVAL;
1364         case MSR_IA32_VMX_EPT_VPID_CAP:
1365                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1366         case MSR_IA32_VMX_VMCS_ENUM:
1367                 vmx->nested.msrs.vmcs_enum = data;
1368                 return 0;
1369         case MSR_IA32_VMX_VMFUNC:
1370                 if (data & ~vmx->nested.msrs.vmfunc_controls)
1371                         return -EINVAL;
1372                 vmx->nested.msrs.vmfunc_controls = data;
1373                 return 0;
1374         default:
1375                 /*
1376                  * The rest of the VMX capability MSRs do not support restore.
1377                  */
1378                 return -EINVAL;
1379         }
1380 }
1381
1382 /* Returns 0 on success, non-0 otherwise. */
1383 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1384 {
1385         switch (msr_index) {
1386         case MSR_IA32_VMX_BASIC:
1387                 *pdata = msrs->basic;
1388                 break;
1389         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1390         case MSR_IA32_VMX_PINBASED_CTLS:
1391                 *pdata = vmx_control_msr(
1392                         msrs->pinbased_ctls_low,
1393                         msrs->pinbased_ctls_high);
1394                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1395                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1396                 break;
1397         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1398         case MSR_IA32_VMX_PROCBASED_CTLS:
1399                 *pdata = vmx_control_msr(
1400                         msrs->procbased_ctls_low,
1401                         msrs->procbased_ctls_high);
1402                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1403                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1404                 break;
1405         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1406         case MSR_IA32_VMX_EXIT_CTLS:
1407                 *pdata = vmx_control_msr(
1408                         msrs->exit_ctls_low,
1409                         msrs->exit_ctls_high);
1410                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1411                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1412                 break;
1413         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1414         case MSR_IA32_VMX_ENTRY_CTLS:
1415                 *pdata = vmx_control_msr(
1416                         msrs->entry_ctls_low,
1417                         msrs->entry_ctls_high);
1418                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1419                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1420                 break;
1421         case MSR_IA32_VMX_MISC:
1422                 *pdata = vmx_control_msr(
1423                         msrs->misc_low,
1424                         msrs->misc_high);
1425                 break;
1426         case MSR_IA32_VMX_CR0_FIXED0:
1427                 *pdata = msrs->cr0_fixed0;
1428                 break;
1429         case MSR_IA32_VMX_CR0_FIXED1:
1430                 *pdata = msrs->cr0_fixed1;
1431                 break;
1432         case MSR_IA32_VMX_CR4_FIXED0:
1433                 *pdata = msrs->cr4_fixed0;
1434                 break;
1435         case MSR_IA32_VMX_CR4_FIXED1:
1436                 *pdata = msrs->cr4_fixed1;
1437                 break;
1438         case MSR_IA32_VMX_VMCS_ENUM:
1439                 *pdata = msrs->vmcs_enum;
1440                 break;
1441         case MSR_IA32_VMX_PROCBASED_CTLS2:
1442                 *pdata = vmx_control_msr(
1443                         msrs->secondary_ctls_low,
1444                         msrs->secondary_ctls_high);
1445                 break;
1446         case MSR_IA32_VMX_EPT_VPID_CAP:
1447                 *pdata = msrs->ept_caps |
1448                         ((u64)msrs->vpid_caps << 32);
1449                 break;
1450         case MSR_IA32_VMX_VMFUNC:
1451                 *pdata = msrs->vmfunc_controls;
1452                 break;
1453         default:
1454                 return 1;
1455         }
1456
1457         return 0;
1458 }
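
/*
 * Example (illustrative only; not referenced by the code above): the packed
 * value returned for a control MSR is assumed to follow the architectural
 * layout, i.e. bits 31:0 are the allowed 0-settings (a control bit must be 1
 * if set here) and bits 63:32 are the allowed 1-settings (a control bit may
 * be 1 only if set here).  Under that assumption, L1 could validate a
 * proposed control field value "ctl" as:
 *
 *	u32 must_be_one = (u32)msr;
 *	u32 may_be_one  = (u32)(msr >> 32);
 *	bool ok = ((ctl & must_be_one) == must_be_one) &&
 *		  ((ctl & ~may_be_one) == 0);
 *
 * which mirrors the check vmx_control_verify() performs on the KVM side.
 */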
1459
1460 /*
1461  * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1462  * been modified by the L1 guest.  Note, "writable" in this context means
1463  * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1464  * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1465  * VM-exit information fields (which are actually writable if the vCPU is
1466  * configured to support "VMWRITE to any supported field in the VMCS").
1467  */
1468 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1469 {
1470         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1471         struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1472         struct shadow_vmcs_field field;
1473         unsigned long val;
1474         int i;
1475
1476         if (WARN_ON(!shadow_vmcs))
1477                 return;
1478
1479         preempt_disable();
1480
1481         vmcs_load(shadow_vmcs);
1482
1483         for (i = 0; i < max_shadow_read_write_fields; i++) {
1484                 field = shadow_read_write_fields[i];
1485                 val = __vmcs_readl(field.encoding);
1486                 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1487         }
1488
1489         vmcs_clear(shadow_vmcs);
1490         vmcs_load(vmx->loaded_vmcs->vmcs);
1491
1492         preempt_enable();
1493 }
1494
1495 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1496 {
1497         const struct shadow_vmcs_field *fields[] = {
1498                 shadow_read_write_fields,
1499                 shadow_read_only_fields
1500         };
1501         const int max_fields[] = {
1502                 max_shadow_read_write_fields,
1503                 max_shadow_read_only_fields
1504         };
1505         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1506         struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1507         struct shadow_vmcs_field field;
1508         unsigned long val;
1509         int i, q;
1510
1511         if (WARN_ON(!shadow_vmcs))
1512                 return;
1513
1514         vmcs_load(shadow_vmcs);
1515
1516         for (q = 0; q < ARRAY_SIZE(fields); q++) {
1517                 for (i = 0; i < max_fields[q]; i++) {
1518                         field = fields[q][i];
1519                         val = vmcs12_read_any(vmcs12, field.encoding,
1520                                               field.offset);
1521                         __vmcs_writel(field.encoding, val);
1522                 }
1523         }
1524
1525         vmcs_clear(shadow_vmcs);
1526         vmcs_load(vmx->loaded_vmcs->vmcs);
1527 }
1528
1529 static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1530 {
1531         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1532         struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1533
1534         /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1535         vmcs12->tpr_threshold = evmcs->tpr_threshold;
1536         vmcs12->guest_rip = evmcs->guest_rip;
1537
1538         if (unlikely(!(evmcs->hv_clean_fields &
1539                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1540                 vmcs12->guest_rsp = evmcs->guest_rsp;
1541                 vmcs12->guest_rflags = evmcs->guest_rflags;
1542                 vmcs12->guest_interruptibility_info =
1543                         evmcs->guest_interruptibility_info;
1544         }
1545
1546         if (unlikely(!(evmcs->hv_clean_fields &
1547                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1548                 vmcs12->cpu_based_vm_exec_control =
1549                         evmcs->cpu_based_vm_exec_control;
1550         }
1551
1552         if (unlikely(!(evmcs->hv_clean_fields &
1553                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1554                 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1555         }
1556
1557         if (unlikely(!(evmcs->hv_clean_fields &
1558                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1559                 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1560         }
1561
1562         if (unlikely(!(evmcs->hv_clean_fields &
1563                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1564                 vmcs12->vm_entry_intr_info_field =
1565                         evmcs->vm_entry_intr_info_field;
1566                 vmcs12->vm_entry_exception_error_code =
1567                         evmcs->vm_entry_exception_error_code;
1568                 vmcs12->vm_entry_instruction_len =
1569                         evmcs->vm_entry_instruction_len;
1570         }
1571
1572         if (unlikely(!(evmcs->hv_clean_fields &
1573                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1574                 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1575                 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1576                 vmcs12->host_cr0 = evmcs->host_cr0;
1577                 vmcs12->host_cr3 = evmcs->host_cr3;
1578                 vmcs12->host_cr4 = evmcs->host_cr4;
1579                 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1580                 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1581                 vmcs12->host_rip = evmcs->host_rip;
1582                 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1583                 vmcs12->host_es_selector = evmcs->host_es_selector;
1584                 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1585                 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1586                 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1587                 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1588                 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1589                 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1590         }
1591
1592         if (unlikely(!(evmcs->hv_clean_fields &
1593                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1594                 vmcs12->pin_based_vm_exec_control =
1595                         evmcs->pin_based_vm_exec_control;
1596                 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1597                 vmcs12->secondary_vm_exec_control =
1598                         evmcs->secondary_vm_exec_control;
1599         }
1600
1601         if (unlikely(!(evmcs->hv_clean_fields &
1602                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1603                 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1604                 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1605         }
1606
1607         if (unlikely(!(evmcs->hv_clean_fields &
1608                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1609                 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1610         }
1611
1612         if (unlikely(!(evmcs->hv_clean_fields &
1613                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1614                 vmcs12->guest_es_base = evmcs->guest_es_base;
1615                 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1616                 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1617                 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1618                 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1619                 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1620                 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1621                 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1622                 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1623                 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1624                 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1625                 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1626                 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1627                 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1628                 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1629                 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1630                 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1631                 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1632                 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1633                 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1634                 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1635                 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1636                 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1637                 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1638                 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1639                 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1640                 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1641                 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1642                 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1643                 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1644                 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1645                 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1646                 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1647                 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1648                 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1649                 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1650         }
1651
1652         if (unlikely(!(evmcs->hv_clean_fields &
1653                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1654                 vmcs12->tsc_offset = evmcs->tsc_offset;
1655                 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1656                 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1657         }
1658
1659         if (unlikely(!(evmcs->hv_clean_fields &
1660                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1661                 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1662                 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1663                 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1664                 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1665                 vmcs12->guest_cr0 = evmcs->guest_cr0;
1666                 vmcs12->guest_cr3 = evmcs->guest_cr3;
1667                 vmcs12->guest_cr4 = evmcs->guest_cr4;
1668                 vmcs12->guest_dr7 = evmcs->guest_dr7;
1669         }
1670
1671         if (unlikely(!(evmcs->hv_clean_fields &
1672                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1673                 vmcs12->host_fs_base = evmcs->host_fs_base;
1674                 vmcs12->host_gs_base = evmcs->host_gs_base;
1675                 vmcs12->host_tr_base = evmcs->host_tr_base;
1676                 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1677                 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1678                 vmcs12->host_rsp = evmcs->host_rsp;
1679         }
1680
1681         if (unlikely(!(evmcs->hv_clean_fields &
1682                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1683                 vmcs12->ept_pointer = evmcs->ept_pointer;
1684                 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1685         }
1686
1687         if (unlikely(!(evmcs->hv_clean_fields &
1688                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1689                 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1690                 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1691                 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1692                 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1693                 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1694                 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1695                 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1696                 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1697                 vmcs12->guest_pending_dbg_exceptions =
1698                         evmcs->guest_pending_dbg_exceptions;
1699                 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1700                 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1701                 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1702                 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1703                 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1704         }
1705
1706         /*
1707          * Not used?
1708          * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1709          * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1710          * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1711          * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1712          * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1713          * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1714          * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1715          * vmcs12->page_fault_error_code_mask =
1716          *              evmcs->page_fault_error_code_mask;
1717          * vmcs12->page_fault_error_code_match =
1718          *              evmcs->page_fault_error_code_match;
1719          * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1720          * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1721          * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1722          * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1723          */
1724
1725         /*
1726          * Read only fields:
1727          * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1728          * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1729          * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1730          * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1731          * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1732          * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1733          * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1734          * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1735          * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1736          * vmcs12->exit_qualification = evmcs->exit_qualification;
1737          * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1738          *
1739          * Not present in struct vmcs12:
1740          * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1741          * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1742          * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1743          * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1744          */
1745
1746         return 0;
1747 }
1748
1749 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1750 {
1751         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1752         struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1753
1754         /*
1755          * Should not be changed by KVM:
1756          *
1757          * evmcs->host_es_selector = vmcs12->host_es_selector;
1758          * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1759          * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1760          * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1761          * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1762          * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1763          * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1764          * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1765          * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1766          * evmcs->host_cr0 = vmcs12->host_cr0;
1767          * evmcs->host_cr3 = vmcs12->host_cr3;
1768          * evmcs->host_cr4 = vmcs12->host_cr4;
1769          * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1770          * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1771          * evmcs->host_rip = vmcs12->host_rip;
1772          * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1773          * evmcs->host_fs_base = vmcs12->host_fs_base;
1774          * evmcs->host_gs_base = vmcs12->host_gs_base;
1775          * evmcs->host_tr_base = vmcs12->host_tr_base;
1776          * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1777          * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1778          * evmcs->host_rsp = vmcs12->host_rsp;
1779          * sync_vmcs02_to_vmcs12() doesn't read these:
1780          * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1781          * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1782          * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1783          * evmcs->ept_pointer = vmcs12->ept_pointer;
1784          * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1785          * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1786          * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1787          * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1788          * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1789          * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1790          * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1791          * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1792          * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1793          * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1794          * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1795          * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1796          * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1797          * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1798          * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1799          * evmcs->page_fault_error_code_mask =
1800          *              vmcs12->page_fault_error_code_mask;
1801          * evmcs->page_fault_error_code_match =
1802          *              vmcs12->page_fault_error_code_match;
1803          * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1804          * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1805          * evmcs->tsc_offset = vmcs12->tsc_offset;
1806          * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1807          * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1808          * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1809          * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1810          * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1811          * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1812          * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1813          * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1814          *
1815          * Not present in struct vmcs12:
1816          * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1817          * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1818          * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1819          * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1820          */
1821
1822         evmcs->guest_es_selector = vmcs12->guest_es_selector;
1823         evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1824         evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1825         evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1826         evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1827         evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1828         evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1829         evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1830
1831         evmcs->guest_es_limit = vmcs12->guest_es_limit;
1832         evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1833         evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1834         evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1835         evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1836         evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1837         evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1838         evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1839         evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1840         evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1841
1842         evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1843         evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1844         evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1845         evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1846         evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1847         evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1848         evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1849         evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1850
1851         evmcs->guest_es_base = vmcs12->guest_es_base;
1852         evmcs->guest_cs_base = vmcs12->guest_cs_base;
1853         evmcs->guest_ss_base = vmcs12->guest_ss_base;
1854         evmcs->guest_ds_base = vmcs12->guest_ds_base;
1855         evmcs->guest_fs_base = vmcs12->guest_fs_base;
1856         evmcs->guest_gs_base = vmcs12->guest_gs_base;
1857         evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1858         evmcs->guest_tr_base = vmcs12->guest_tr_base;
1859         evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1860         evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1861
1862         evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1863         evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1864
1865         evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1866         evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1867         evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1868         evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1869
1870         evmcs->guest_pending_dbg_exceptions =
1871                 vmcs12->guest_pending_dbg_exceptions;
1872         evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1873         evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1874
1875         evmcs->guest_activity_state = vmcs12->guest_activity_state;
1876         evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1877
1878         evmcs->guest_cr0 = vmcs12->guest_cr0;
1879         evmcs->guest_cr3 = vmcs12->guest_cr3;
1880         evmcs->guest_cr4 = vmcs12->guest_cr4;
1881         evmcs->guest_dr7 = vmcs12->guest_dr7;
1882
1883         evmcs->guest_physical_address = vmcs12->guest_physical_address;
1884
1885         evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1886         evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1887         evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1888         evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1889         evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1890         evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1891         evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1892         evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1893
1894         evmcs->exit_qualification = vmcs12->exit_qualification;
1895
1896         evmcs->guest_linear_address = vmcs12->guest_linear_address;
1897         evmcs->guest_rsp = vmcs12->guest_rsp;
1898         evmcs->guest_rflags = vmcs12->guest_rflags;
1899
1900         evmcs->guest_interruptibility_info =
1901                 vmcs12->guest_interruptibility_info;
1902         evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1903         evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1904         evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1905         evmcs->vm_entry_exception_error_code =
1906                 vmcs12->vm_entry_exception_error_code;
1907         evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1908
1909         evmcs->guest_rip = vmcs12->guest_rip;
1910
1911         evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1912
1913         return 0;
1914 }
1915
1916 /*
1917  * This is an equivalent of the nested hypervisor executing the vmptrld
1918  * instruction.
1919  */
1920 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1921                                                  bool from_launch)
1922 {
1923         struct vcpu_vmx *vmx = to_vmx(vcpu);
1924         bool evmcs_gpa_changed = false;
1925         u64 evmcs_gpa;
1926
1927         if (likely(!vmx->nested.enlightened_vmcs_enabled))
1928                 return 1;
1929
1930         if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1931                 return 1;
1932
1933         if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1934                 if (!vmx->nested.hv_evmcs)
1935                         vmx->nested.current_vmptr = -1ull;
1936
1937                 nested_release_evmcs(vcpu);
1938
1939                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1940                                  &vmx->nested.hv_evmcs_map))
1941                         return 0;
1942
1943                 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1944
1945                 /*
1946                  * Currently, KVM only supports eVMCS version 1
1947                  * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
1948                  * first u32 field of the eVMCS (which specifies the eVMCS
1949                  * VersionNumber) to that value.
1950                  *
1951                  * The guest should learn the host's supported eVMCS versions by
1952                  * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
1953                  * expected to set this CPUID leaf according to the value
1954                  * returned in vmcs_version from nested_enable_evmcs().
1955                  *
1956                  * However, it turns out that Microsoft Hyper-V fails to comply
1957                  * with its own invented interface: when Hyper-V uses eVMCS, it
1958                  * just sets the first u32 field of the eVMCS to the revision_id
1959                  * specified in MSR_IA32_VMX_BASIC instead of an eVMCS version
1960                  * number, which should be one of the supported versions specified
1961                  * in CPUID.0x4000000A.EAX[0:15].
1962                  *
1963                  * To work around this Hyper-V bug, accept either a supported
1964                  * eVMCS version or the VMCS12 revision_id as valid values for the
1965                  * first u32 field of the eVMCS.
1966                  */
1967                 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1968                     (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1969                         nested_release_evmcs(vcpu);
1970                         return 0;
1971                 }
1972
1973                 vmx->nested.dirty_vmcs12 = true;
1974                 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1975
1976                 evmcs_gpa_changed = true;
1977                 /*
1978                  * Unlike normal vmcs12, enlightened vmcs12 is not fully
1979                  * reloaded from guest's memory (read only fields, fields not
1980                  * present in struct hv_enlightened_vmcs, ...). Make sure there
1981                  * are no leftovers.
1982                  */
1983                 if (from_launch) {
1984                         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1985                         memset(vmcs12, 0, sizeof(*vmcs12));
1986                         vmcs12->hdr.revision_id = VMCS12_REVISION;
1987                 }
1988
1989         }
1990
1991         /*
1992          * Clean fields data can't be used on VMLAUNCH or when we switch
1993          * between different L2 guests, as KVM keeps a single VMCS12 per L1.
1994          */
1995         if (from_launch || evmcs_gpa_changed)
1996                 vmx->nested.hv_evmcs->hv_clean_fields &=
1997                         ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1998
1999         return 1;
2000 }
2001
2002 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2003 {
2004         struct vcpu_vmx *vmx = to_vmx(vcpu);
2005
2006         /*
2007          * hv_evmcs may end up not being mapped after migration (when
2008          * L2 was running); map it here to make sure vmcs12 changes are
2009          * properly reflected.
2010          */
2011         if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
2012                 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
2013
2014         if (vmx->nested.hv_evmcs) {
2015                 copy_vmcs12_to_enlightened(vmx);
2016                 /* All fields are clean */
2017                 vmx->nested.hv_evmcs->hv_clean_fields |=
2018                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2019         } else {
2020                 copy_vmcs12_to_shadow(vmx);
2021         }
2022
2023         vmx->nested.need_vmcs12_to_shadow_sync = false;
2024 }
2025
2026 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2027 {
2028         struct vcpu_vmx *vmx =
2029                 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2030
2031         vmx->nested.preemption_timer_expired = true;
2032         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2033         kvm_vcpu_kick(&vmx->vcpu);
2034
2035         return HRTIMER_NORESTART;
2036 }
2037
2038 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
2039 {
2040         u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
2041         struct vcpu_vmx *vmx = to_vmx(vcpu);
2042
2043         /*
2044          * A timer value of zero is architecturally guaranteed to cause
2045          * a VMExit prior to executing any instructions in the guest.
2046          */
2047         if (preemption_timeout == 0) {
2048                 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2049                 return;
2050         }
2051
2052         if (vcpu->arch.virtual_tsc_khz == 0)
2053                 return;
2054
2055         preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2056         preemption_timeout *= 1000000;
2057         do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2058         hrtimer_start(&vmx->nested.preemption_timer,
2059                       ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
2060 }
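
/*
 * Worked example (illustrative only): the conversion above scales the vmcs12
 * preemption timer value by 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC
 * ticks per timer tick and then converts TSC ticks to nanoseconds.  Assuming
 * virtual_tsc_khz == 2000000 (a 2 GHz guest TSC) and a vmcs12 timer value of
 * 1000:
 *
 *	ticks = 1000 << 5                 = 32000 TSC ticks
 *	ns    = 32000 * 1000000 / 2000000 = 16000 ns
 *
 * so the hrtimer fires roughly 16 microseconds later.
 */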
2061
2062 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2063 {
2064         if (vmx->nested.nested_run_pending &&
2065             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2066                 return vmcs12->guest_ia32_efer;
2067         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2068                 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2069         else
2070                 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2071 }
2072
2073 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2074 {
2075         /*
2076          * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2077          * according to L0's settings (vmcs12 is irrelevant here).  Host
2078          * fields that come from L0 and are not constant, e.g. HOST_CR3,
2079          * will be set as needed prior to VMLAUNCH/VMRESUME.
2080          */
2081         if (vmx->nested.vmcs02_initialized)
2082                 return;
2083         vmx->nested.vmcs02_initialized = true;
2084
2085         /*
2086          * We don't care what the EPTP value is; we just need to guarantee
2087          * it's valid so we don't get a false positive when doing early
2088          * consistency checks.
2089          */
2090         if (enable_ept && nested_early_check)
2091                 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
2092
2093         /* All VMFUNCs are currently emulated through L0 vmexits.  */
2094         if (cpu_has_vmx_vmfunc())
2095                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2096
2097         if (cpu_has_vmx_posted_intr())
2098                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2099
2100         if (cpu_has_vmx_msr_bitmap())
2101                 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2102
2103         /*
2104          * The PML address never changes, so it is constant in vmcs02.
2105          * Conceptually we want to copy the PML index from vmcs01 here,
2106          * and then back to vmcs01 on nested vmexit.  But since we flush
2107          * the log and reset GUEST_PML_INDEX on each vmexit, the PML
2108          * index is also effectively constant in vmcs02.
2109          */
2110         if (enable_pml) {
2111                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
2112                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2113         }
2114
2115         if (cpu_has_vmx_encls_vmexit())
2116                 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2117
2118         /*
2119          * Set the MSR load/store lists to match L0's settings.  Only the
2120          * addresses are constant (for vmcs02); the counts can change based
2121          * on L2's behavior, e.g. switching to/from long mode.
2122          */
2123         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2124         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2125         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2126
2127         vmx_set_constant_host_state(vmx);
2128 }
2129
2130 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2131                                       struct vmcs12 *vmcs12)
2132 {
2133         prepare_vmcs02_constant_state(vmx);
2134
2135         vmcs_write64(VMCS_LINK_POINTER, -1ull);
2136
2137         if (enable_vpid) {
2138                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2139                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2140                 else
2141                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2142         }
2143 }
2144
2145 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2146 {
2147         u32 exec_control, vmcs12_exec_ctrl;
2148         u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2149
2150         if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2151                 prepare_vmcs02_early_rare(vmx, vmcs12);
2152
2153         /*
2154          * PIN CONTROLS
2155          */
2156         exec_control = vmx_pin_based_exec_ctrl(vmx);
2157         exec_control |= (vmcs12->pin_based_vm_exec_control &
2158                          ~PIN_BASED_VMX_PREEMPTION_TIMER);
2159
2160         /* Posted interrupts setting is only taken from vmcs12.  */
2161         if (nested_cpu_has_posted_intr(vmcs12)) {
2162                 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2163                 vmx->nested.pi_pending = false;
2164         } else {
2165                 exec_control &= ~PIN_BASED_POSTED_INTR;
2166         }
2167         pin_controls_set(vmx, exec_control);
2168
2169         /*
2170          * EXEC CONTROLS
2171          */
2172         exec_control = vmx_exec_control(vmx); /* L0's desires */
2173         exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2174         exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2175         exec_control &= ~CPU_BASED_TPR_SHADOW;
2176         exec_control |= vmcs12->cpu_based_vm_exec_control;
2177
2178         vmx->nested.l1_tpr_threshold = -1;
2179         if (exec_control & CPU_BASED_TPR_SHADOW)
2180                 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2181 #ifdef CONFIG_X86_64
2182         else
2183                 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2184                                 CPU_BASED_CR8_STORE_EXITING;
2185 #endif
2186
2187         /*
2188          * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2189          * for I/O port accesses.
2190          */
2191         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2192         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2193
2194         /*
2195          * This bit will be computed in nested_get_vmcs12_pages, because
2196          * we do not have access to L1's MSR bitmap yet.  For now, keep
2197          * the same bit as before, hoping to avoid multiple VMWRITEs that
2198          * only set/clear this bit.
2199          */
2200         exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2201         exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2202
2203         exec_controls_set(vmx, exec_control);
2204
2205         /*
2206          * SECONDARY EXEC CONTROLS
2207          */
2208         if (cpu_has_secondary_exec_ctrls()) {
2209                 exec_control = vmx->secondary_exec_control;
2210
2211                 /* Take the following fields only from vmcs12 */
2212                 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2213                                   SECONDARY_EXEC_ENABLE_INVPCID |
2214                                   SECONDARY_EXEC_RDTSCP |
2215                                   SECONDARY_EXEC_XSAVES |
2216                                   SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2217                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2218                                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
2219                                   SECONDARY_EXEC_ENABLE_VMFUNC);
2220                 if (nested_cpu_has(vmcs12,
2221                                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2222                         vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2223                                 ~SECONDARY_EXEC_ENABLE_PML;
2224                         exec_control |= vmcs12_exec_ctrl;
2225                 }
2226
2227                 /* VMCS shadowing for L2 is emulated for now */
2228                 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2229
2230                 /*
2231                  * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2232                  * will not have to rewrite the controls just for this bit.
2233                  */
2234                 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2235                     (vmcs12->guest_cr4 & X86_CR4_UMIP))
2236                         exec_control |= SECONDARY_EXEC_DESC;
2237
2238                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2239                         vmcs_write16(GUEST_INTR_STATUS,
2240                                 vmcs12->guest_intr_status);
2241
2242                 secondary_exec_controls_set(vmx, exec_control);
2243         }
2244
2245         /*
2246          * ENTRY CONTROLS
2247          *
2248          * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2249          * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2250          * on the related bits (if supported by the CPU) in the hope that
2251          * we can avoid VMWrites during vmx_set_efer().
2252          */
2253         exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2254                         ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2255         if (cpu_has_load_ia32_efer()) {
2256                 if (guest_efer & EFER_LMA)
2257                         exec_control |= VM_ENTRY_IA32E_MODE;
2258                 if (guest_efer != host_efer)
2259                         exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2260         }
2261         vm_entry_controls_set(vmx, exec_control);
2262
2263         /*
2264          * EXIT CONTROLS
2265          *
2266          * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2267          * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2268          * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2269          */
2270         exec_control = vmx_vmexit_ctrl();
2271         if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2272                 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2273         vm_exit_controls_set(vmx, exec_control);
2274
2275         /*
2276          * Interrupt/Exception Fields
2277          */
2278         if (vmx->nested.nested_run_pending) {
2279                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2280                              vmcs12->vm_entry_intr_info_field);
2281                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2282                              vmcs12->vm_entry_exception_error_code);
2283                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2284                              vmcs12->vm_entry_instruction_len);
2285                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2286                              vmcs12->guest_interruptibility_info);
2287                 vmx->loaded_vmcs->nmi_known_unmasked =
2288                         !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2289         } else {
2290                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2291         }
2292 }
2293
2294 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2295 {
2296         struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2297
2298         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2299                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2300                 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2301                 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2302                 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2303                 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2304                 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2305                 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2306                 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2307                 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2308                 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2309                 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2310                 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2311                 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2312                 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2313                 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2314                 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2315                 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2316                 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2317                 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2318                 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2319                 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2320                 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2321                 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2322                 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2323                 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2324                 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2325                 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2326                 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2327                 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2328                 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2329                 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2330                 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2331                 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2332                 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2333                 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2334                 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2335                 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2336         }
2337
2338         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2339                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2340                 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2341                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2342                             vmcs12->guest_pending_dbg_exceptions);
2343                 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2344                 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2345
2346                 /*
2347                  * L1 may access L2's PDPTRs, so save them to construct
2348                  * vmcs12.
2349                  */
2350                 if (enable_ept) {
2351                         vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2352                         vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2353                         vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2354                         vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2355                 }
2356
2357                 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2358                     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2359                         vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2360         }
2361
2362         if (nested_cpu_has_xsaves(vmcs12))
2363                 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2364
2365         /*
2366          * Whether page-faults are trapped is determined by a combination of
2367          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2368          * If enable_ept, L0 doesn't care about page faults and we should
2369          * set all of these to L1's desires. However, if !enable_ept, L0 does
2370          * care about (at least some) page faults, and because it is not easy
2371          * (if at all possible?) to merge L0 and L1's desires, we simply ask
2372          * to exit on each and every L2 page fault. This is done by setting
2373          * MASK=MATCH=0 and (see below) EB.PF=1.
2374          * Note that below we don't need special code to set EB.PF beyond the
2375          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2376          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2377          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2378          */
2379         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2380                 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2381         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2382                 enable_ept ? vmcs12->page_fault_error_code_match : 0);
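
	/*
	 * For example (assuming the architectural PFEC_MASK/PFEC_MATCH
	 * semantics): with MASK == MATCH == 0, "(PFEC & MASK) == MATCH" holds
	 * for every page fault, so whether a #PF causes a VM-exit is decided
	 * solely by the EB.PF bit built from vmcs01 and vmcs12.
	 */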
2383
2384         if (cpu_has_vmx_apicv()) {
2385                 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2386                 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2387                 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2388                 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2389         }
2390
2391         /*
2392          * Make sure the msr_autostore list is up to date before we set the
2393          * count in the vmcs02.
2394          */
2395         prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2396
2397         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2398         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2399         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2400
2401         set_cr4_guest_host_mask(vmx);
2402 }
2403
2404 /*
2405  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2406  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2407  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2408  * guest in a way that will both be appropriate to L1's requests, and our
2409  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2410  * function also has necessary side effects, like setting various
2411  * vcpu->arch fields.
2412  * Returns 0 on success, -EINVAL on failure.  An invalid-state exit
2413  * qualification code is assigned to entry_failure_code on failure.
2414  */
2415 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2416                           u32 *entry_failure_code)
2417 {
2418         struct vcpu_vmx *vmx = to_vmx(vcpu);
2419         struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2420         bool load_guest_pdptrs_vmcs12 = false;
2421
2422         if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2423                 prepare_vmcs02_rare(vmx, vmcs12);
2424                 vmx->nested.dirty_vmcs12 = false;
2425
2426                 load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2427                         !(hv_evmcs->hv_clean_fields &
2428                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2429         }
2430
2431         if (vmx->nested.nested_run_pending &&
2432             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2433                 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2434                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2435         } else {
2436                 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2437                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2438         }
2439         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2440             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2441                 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2442         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2443
2444         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2445          * bitwise-or of what L1 wants to trap for L2, and what we want to
2446          * trap. Note that CR0.TS also needs updating - we do this later.
2447          */
2448         update_exception_bitmap(vcpu);
2449         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2450         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2451
2452         if (vmx->nested.nested_run_pending &&
2453             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2454                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2455                 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2456         } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2457                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2458         }
2459
2460         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2461
2462         if (kvm_has_tsc_control)
2463                 decache_tsc_multiplier(vmx);
2464
2465         if (enable_vpid) {
2466                 /*
2467                  * There is no direct mapping between vpid02 and vpid12: the
2468                  * vpid02 is per-vCPU for L0 and reused, while a change in
2469                  * vpid12 is handled with a single invvpid during nested vmentry.
2470                  * The vpid12 is allocated by L1 for L2, so it does not
2471                  * influence the global bitmap (for vpid01 and vpid02 allocation)
2472                  * even if L1 spawns a lot of nested vCPUs.
2473                  */
2474                 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2475                         if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2476                                 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2477                                 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2478                         }
2479                 } else {
2480                         /*
2481                          * If L1 uses EPT, then L0 needs to execute INVEPT on
2482                          * EPTP02 instead of EPTP01. Therefore, delay the TLB
2483                          * flush until vmcs02->eptp is fully updated by
2484                          * KVM_REQ_LOAD_CR3. Note that this assumes
2485                          * KVM_REQ_TLB_FLUSH is evaluated after
2486                          * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2487                          */
2488                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2489                 }
2490         }
2491
2492         if (nested_cpu_has_ept(vmcs12))
2493                 nested_ept_init_mmu_context(vcpu);
2494         else if (nested_cpu_has2(vmcs12,
2495                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2496                 vmx_flush_tlb(vcpu, true);
2497
2498         /*
2499          * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
2500          * bits that we require to be set.
2501          * The CR0_READ_SHADOW is what L2 should have expected to read given
2502          * the specifications by L1; it's not enough to take
2503          * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
2504          * have more bits set than L1 expected.
2505          */
2506         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2507         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2508
2509         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2510         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2511
2512         vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2513         /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2514         vmx_set_efer(vcpu, vcpu->arch.efer);
2515
2516         /*
2517          * Guest state is invalid and unrestricted guest is disabled,
2518          * which means L1 attempted VMEntry to L2 with invalid state.
2519          * Fail the VMEntry.
2520          */
2521         if (vmx->emulation_required) {
2522                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2523                 return -EINVAL;
2524         }
2525
2526         /* Load guest CR3, backed by either nested EPT or shadow page tables. */
2527         if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2528                                 entry_failure_code))
2529                 return -EINVAL;
2530
2531         /*
2532          * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2533          * on nested VM-Exit, which can occur without actually running L2 and
2534          * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
2535          * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2536          * transition to HLT instead of running L2.
2537          */
2538         if (enable_ept)
2539                 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2540
2541         /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2542         if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2543             is_pae_paging(vcpu)) {
2544                 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2545                 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2546                 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2547                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2548         }
2549
2550         if (!enable_ept)
2551                 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2552
2553         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2554             SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2555                             vmcs12->guest_ia32_perf_global_ctrl))
2556                 return -EINVAL;
2557
2558         kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2559         kvm_rip_write(vcpu, vmcs12->guest_rip);
2560         return 0;
2561 }
2562
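/*
 * NMI controls must be consistent: "virtual NMIs" requires "NMI exiting",
 * and "NMI-window exiting" (CPU_BASED_VIRTUAL_NMI_PENDING) requires
 * "virtual NMIs".  The two CC() checks below encode those dependencies.
 */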
2563 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2564 {
2565         if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2566                nested_cpu_has_virtual_nmis(vmcs12)))
2567                 return -EINVAL;
2568
2569         if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2570                nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
2571                 return -EINVAL;
2572
2573         return 0;
2574 }
2575
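/*
 * Validate the EPTP that L1 wants to use: the memory type (bits 2:0), the
 * page-walk length (bits 5:3), the accessed/dirty enable bit (bit 6), and
 * the reserved bits (11:7 and anything at or above MAXPHYADDR) are checked
 * against the EPT capabilities exposed to L1.
 */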
2576 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2577 {
2578         struct vcpu_vmx *vmx = to_vmx(vcpu);
2579         int maxphyaddr = cpuid_maxphyaddr(vcpu);
2580
2581         /* Check for memory type validity */
2582         switch (address & VMX_EPTP_MT_MASK) {
2583         case VMX_EPTP_MT_UC:
2584                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2585                         return false;
2586                 break;
2587         case VMX_EPTP_MT_WB:
2588                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2589                         return false;
2590                 break;
2591         default:
2592                 return false;
2593         }
2594
2595         /* Only a 4-level page-walk length is valid. */
2596         if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
2597                 return false;
2598
2599         /* Reserved bits should not be set */
2600         if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
2601                 return false;
2602
2603         /* AD, if set, should be supported */
2604         if (address & VMX_EPTP_AD_ENABLE_BIT) {
2605                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2606                         return false;
2607         }
2608
2609         return true;
2610 }
2611
2612 /*
2613  * Checks related to VM-Execution Control Fields
2614  */
2615 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2616                                               struct vmcs12 *vmcs12)
2617 {
2618         struct vcpu_vmx *vmx = to_vmx(vcpu);
2619
2620         if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2621                                    vmx->nested.msrs.pinbased_ctls_low,
2622                                    vmx->nested.msrs.pinbased_ctls_high)) ||
2623             CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2624                                    vmx->nested.msrs.procbased_ctls_low,
2625                                    vmx->nested.msrs.procbased_ctls_high)))
2626                 return -EINVAL;
2627
2628         if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2629             CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2630                                    vmx->nested.msrs.secondary_ctls_low,
2631                                    vmx->nested.msrs.secondary_ctls_high)))
2632                 return -EINVAL;
2633
2634         if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2635             nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2636             nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2637             nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2638             nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2639             nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2640             nested_vmx_check_nmi_controls(vmcs12) ||
2641             nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2642             nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2643             nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2644             nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2645             CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2646                 return -EINVAL;
2647
2648         if (!nested_cpu_has_preemption_timer(vmcs12) &&
2649             nested_cpu_has_save_preemption_timer(vmcs12))
2650                 return -EINVAL;
2651
2652         if (nested_cpu_has_ept(vmcs12) &&
2653             CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
2654                 return -EINVAL;
2655
2656         if (nested_cpu_has_vmfunc(vmcs12)) {
2657                 if (CC(vmcs12->vm_function_control &
2658                        ~vmx->nested.msrs.vmfunc_controls))
2659                         return -EINVAL;
2660
2661                 if (nested_cpu_has_eptp_switching(vmcs12)) {
2662                         if (CC(!nested_cpu_has_ept(vmcs12)) ||
2663                             CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2664                                 return -EINVAL;
2665                 }
2666         }
2667
2668         return 0;
2669 }
2670
2671 /*
2672  * Checks related to VM-Exit Control Fields
2673  */
2674 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2675                                          struct vmcs12 *vmcs12)
2676 {
2677         struct vcpu_vmx *vmx = to_vmx(vcpu);
2678
2679         if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2680                                     vmx->nested.msrs.exit_ctls_low,
2681                                     vmx->nested.msrs.exit_ctls_high)) ||
2682             CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2683                 return -EINVAL;
2684
2685         return 0;
2686 }
2687
2688 /*
2689  * Checks related to VM-Entry Control Fields
2690  */
2691 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2692                                           struct vmcs12 *vmcs12)
2693 {
2694         struct vcpu_vmx *vmx = to_vmx(vcpu);
2695
2696         if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2697                                     vmx->nested.msrs.entry_ctls_low,
2698                                     vmx->nested.msrs.entry_ctls_high)))
2699                 return -EINVAL;
2700
2701         /*
2702          * From the Intel SDM, volume 3:
2703          * Fields relevant to VM-entry event injection must be set properly.
2704          * These fields are the VM-entry interruption-information field, the
2705          * VM-entry exception error code, and the VM-entry instruction length.
2706          */
2707         if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2708                 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2709                 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2710                 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2711                 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2712                 bool should_have_error_code;
2713                 bool urg = nested_cpu_has2(vmcs12,
2714                                            SECONDARY_EXEC_UNRESTRICTED_GUEST);
2715                 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2716
2717                 /* VM-entry interruption-info field: interruption type */
2718                 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2719                     CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2720                        !nested_cpu_supports_monitor_trap_flag(vcpu)))
2721                         return -EINVAL;
2722
2723                 /* VM-entry interruption-info field: vector */
2724                 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2725                     CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2726                     CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2727                         return -EINVAL;
2728
2729                 /* VM-entry interruption-info field: deliver error code */
2730                 should_have_error_code =
2731                         intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2732                         x86_exception_has_error_code(vector);
2733                 if (CC(has_error_code != should_have_error_code))
2734                         return -EINVAL;
2735
2736                 /* VM-entry exception error code */
2737                 if (CC(has_error_code &&
2738                        vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2739                         return -EINVAL;
2740
2741                 /* VM-entry interruption-info field: reserved bits */
2742                 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2743                         return -EINVAL;
2744
2745                 /* VM-entry instruction length */
2746                 switch (intr_type) {
2747                 case INTR_TYPE_SOFT_EXCEPTION:
2748                 case INTR_TYPE_SOFT_INTR:
2749                 case INTR_TYPE_PRIV_SW_EXCEPTION:
2750                         if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2751                             CC(vmcs12->vm_entry_instruction_len == 0 &&
2752                                !nested_cpu_has_zero_length_injection(vcpu)))
2753                                 return -EINVAL;
2754                 }
2755         }
2756
2757         if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2758                 return -EINVAL;
2759
2760         return 0;
2761 }
2762
2763 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2764                                      struct vmcs12 *vmcs12)
2765 {
2766         if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2767             nested_check_vm_exit_controls(vcpu, vmcs12) ||
2768             nested_check_vm_entry_controls(vcpu, vmcs12))
2769                 return -EINVAL;
2770
2771         return 0;
2772 }
2773
2774 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2775                                        struct vmcs12 *vmcs12)
2776 {
2777         bool ia32e;
2778
2779         if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2780             CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2781             CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
2782                 return -EINVAL;
2783
2784         if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2785             CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2786                 return -EINVAL;
2787
2788         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2789             CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2790                 return -EINVAL;
2791
2792         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2793             CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2794                                            vmcs12->host_ia32_perf_global_ctrl)))
2795                 return -EINVAL;
2796
2797 #ifdef CONFIG_X86_64
2798         ia32e = !!(vcpu->arch.efer & EFER_LMA);
2799 #else
2800         ia32e = false;
2801 #endif
2802
2803         if (ia32e) {
2804                 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2805                     CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2806                         return -EINVAL;
2807         } else {
2808                 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2809                     CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2810                     CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2811                     CC((vmcs12->host_rip) >> 32))
2812                         return -EINVAL;
2813         }
2814
2815         if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2816             CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2817             CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2818             CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2819             CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2820             CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2821             CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2822             CC(vmcs12->host_cs_selector == 0) ||
2823             CC(vmcs12->host_tr_selector == 0) ||
2824             CC(vmcs12->host_ss_selector == 0 && !ia32e))
2825                 return -EINVAL;
2826
2827 #ifdef CONFIG_X86_64
2828         if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2829             CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2830             CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2831             CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2832             CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2833             CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2834                 return -EINVAL;
2835 #endif
2836
2837         /*
2838          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2839          * IA32_EFER MSR must be 0 in the field for that register. In addition,
2840          * the values of the LMA and LME bits in the field must each be that of
2841          * the host address-space size VM-exit control.
2842          */
2843         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2844                 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2845                     CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2846                     CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2847                         return -EINVAL;
2848         }
2849
2850         return 0;
2851 }
2852
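/*
 * If vmcs12 has a valid VMCS link pointer, map it and verify that the
 * referenced VMCS has the expected revision ID and that its shadow-VMCS
 * indicator matches whether vmcs12 actually enables VMCS shadowing.
 */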
2853 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2854                                           struct vmcs12 *vmcs12)
2855 {
2856         int r = 0;
2857         struct vmcs12 *shadow;
2858         struct kvm_host_map map;
2859
2860         if (vmcs12->vmcs_link_pointer == -1ull)
2861                 return 0;
2862
2863         if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2864                 return -EINVAL;
2865
2866         if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2867                 return -EINVAL;
2868
2869         shadow = map.hva;
2870
2871         if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2872             CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2873                 r = -EINVAL;
2874
2875         kvm_vcpu_unmap(vcpu, &map, false);
2876         return r;
2877 }
2878
2879 /*
2880  * Checks related to Guest Non-register State
2881  */
2882 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2883 {
2884         if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2885                vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
2886                 return -EINVAL;
2887
2888         return 0;
2889 }
2890
2891 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2892                                         struct vmcs12 *vmcs12,
2893                                         u32 *exit_qual)
2894 {
2895         bool ia32e;
2896
2897         *exit_qual = ENTRY_FAIL_DEFAULT;
2898
2899         if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2900             CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
2901                 return -EINVAL;
2902
2903         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2904             CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
2905                 return -EINVAL;
2906
2907         if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2908                 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2909                 return -EINVAL;
2910         }
2911
2912         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2913             CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2914                                            vmcs12->guest_ia32_perf_global_ctrl)))
2915                 return -EINVAL;
2916
2917         /*
2918          * If the load IA32_EFER VM-entry control is 1, the following checks
2919          * are performed on the field for the IA32_EFER MSR:
2920          * - Bits reserved in the IA32_EFER MSR must be 0.
2921          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2922          *   the IA-32e mode guest VM-entry control. It must also be identical
2923          *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2924          *   CR0.PG) is 1.
2925          */
2926         if (to_vmx(vcpu)->nested.nested_run_pending &&
2927             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2928                 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2929                 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
2930                     CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
2931                     CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
2932                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
2933                         return -EINVAL;
2934         }
2935
2936         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2937             (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
2938              CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
2939                 return -EINVAL;
2940
2941         if (nested_check_guest_non_reg_state(vmcs12))
2942                 return -EINVAL;
2943
2944         return 0;
2945 }
2946
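/*
 * Optionally (nested_early_check) let hardware vet vmcs02 by attempting a
 * VM-Enter that is forced to fail: GUEST_RFLAGS has its reserved bit 1
 * cleared, so either a VM-Fail (bad control fields) or a consistency-check
 * VM-Exit occurs before any L2 code can run.
 */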
2947 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2948 {
2949         struct vcpu_vmx *vmx = to_vmx(vcpu);
2950         unsigned long cr3, cr4;
2951         bool vm_fail;
2952
2953         if (!nested_early_check)
2954                 return 0;
2955
2956         if (vmx->msr_autoload.host.nr)
2957                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2958         if (vmx->msr_autoload.guest.nr)
2959                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2960
2961         preempt_disable();
2962
2963         vmx_prepare_switch_to_guest(vcpu);
2964
2965         /*
2966          * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2967          * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
2968          * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2969          * there is no need to preserve other bits or save/restore the field.
2970          */
2971         vmcs_writel(GUEST_RFLAGS, 0);
2972
2973         cr3 = __get_current_cr3_fast();
2974         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2975                 vmcs_writel(HOST_CR3, cr3);
2976                 vmx->loaded_vmcs->host_state.cr3 = cr3;
2977         }
2978
2979         cr4 = cr4_read_shadow();
2980         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2981                 vmcs_writel(HOST_CR4, cr4);
2982                 vmx->loaded_vmcs->host_state.cr4 = cr4;
2983         }
2984
2985         asm(
2986                 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2987                 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2988                 "je 1f \n\t"
2989                 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2990                 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2991                 "1: \n\t"
2992                 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2993
2994                 /* Check if vmlaunch or vmresume is needed */
2995                 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2996
2997                 /*
2998                  * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2999                  * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
3000                  * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
3001                  * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
3002                  */
3003                 "call vmx_vmenter\n\t"
3004
3005                 CC_SET(be)
3006               : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
3007               : [HOST_RSP]"r"((unsigned long)HOST_RSP),
3008                 [loaded_vmcs]"r"(vmx->loaded_vmcs),
3009                 [launched]"i"(offsetof(struct loaded_vmcs, launched)),
3010                 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
3011                 [wordsize]"i"(sizeof(ulong))
3012               : "memory"
3013         );
3014
3015         if (vmx->msr_autoload.host.nr)
3016                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3017         if (vmx->msr_autoload.guest.nr)
3018                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3019
3020         if (vm_fail) {
3021                 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3022
3023                 preempt_enable();
3024
3025                 trace_kvm_nested_vmenter_failed(
3026                         "early hardware check VM-instruction error: ", error);
3027                 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3028                 return 1;
3029         }
3030
3031         /*
3032          * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3033          */
3034         local_irq_enable();
3035         if (hw_breakpoint_active())
3036                 set_debugreg(__this_cpu_read(cpu_dr7), 7);
3037         preempt_enable();
3038
3039         /*
3040          * A non-failing VMEntry means we somehow entered guest mode with
3041          * an illegal RIP, and that's just the tip of the iceberg.  There
3042          * is no telling what memory has been modified or what state has
3043          * been exposed to unknown code.  Hitting this all but guarantees
3044          * a (very critical) hardware issue.
3045          */
3046         WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3047                 VMX_EXIT_REASONS_FAILED_VMENTRY));
3048
3049         return 0;
3050 }
3051
3052 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
3053                                                  struct vmcs12 *vmcs12);
3054
3055 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3056 {
3057         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3058         struct vcpu_vmx *vmx = to_vmx(vcpu);
3059         struct kvm_host_map *map;
3060         struct page *page;
3061         u64 hpa;
3062
3063         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3064                 /*
3065                  * Translate L1 physical address to host physical
3066                  * address for vmcs02. Keep the page pinned, so this
3067                  * physical address remains valid. We keep a reference
3068                  * to it so we can release it later.
3069                  */
3070                 if (vmx->nested.apic_access_page) { /* shouldn't happen */
3071                         kvm_release_page_dirty(vmx->nested.apic_access_page);
3072                         vmx->nested.apic_access_page = NULL;
3073                 }
3074                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
3075                 /*
3076                  * If translation failed, no matter: This feature asks
3077                  * to exit when accessing the given address, and if it
3078                  * can never be accessed, this feature won't do
3079                  * anything anyway.
3080                  */
3081                 if (!is_error_page(page)) {
3082                         vmx->nested.apic_access_page = page;
3083                         hpa = page_to_phys(vmx->nested.apic_access_page);
3084                         vmcs_write64(APIC_ACCESS_ADDR, hpa);
3085                 } else {
3086                         secondary_exec_controls_clearbit(vmx,
3087                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
3088                 }
3089         }
3090
3091         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3092                 map = &vmx->nested.virtual_apic_map;
3093
3094                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3095                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3096                 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3097                            nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3098                            !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3099                         /*
3100                          * The processor will never use the TPR shadow, simply
3101                          * clear the bit from the execution control.  Such a
3102                          * configuration is useless, but it happens in tests.
3103                          * For any other configuration, failing the vm entry is
3104                          * _not_ what the processor does but it's basically the
3105                          * only possibility we have.
3106                          */
3107                         exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3108                 } else {
3109                         /*
3110                          * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3111                          * force VM-Entry to fail.
3112                          */
3113                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3114                 }
3115         }
3116
3117         if (nested_cpu_has_posted_intr(vmcs12)) {
3118                 map = &vmx->nested.pi_desc_map;
3119
3120                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3121                         vmx->nested.pi_desc =
3122                                 (struct pi_desc *)(((void *)map->hva) +
3123                                 offset_in_page(vmcs12->posted_intr_desc_addr));
3124                         vmcs_write64(POSTED_INTR_DESC_ADDR,
3125                                      pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3126                 }
3127         }
3128         if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3129                 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3130         else
3131                 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3132 }
3133
3134 /*
3135  * Intel's VMX Instruction Reference specifies a common set of prerequisites
3136  * for running VMX instructions (except VMXON, whose prerequisites are
3137  * slightly different). It also specifies what exception to inject otherwise.
3138  * Note that many of these exceptions have priority over VM exits, so they
3139  * don't have to be checked again here.
3140  */
3141 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3142 {
3143         if (!to_vmx(vcpu)->nested.vmxon) {
3144                 kvm_queue_exception(vcpu, UD_VECTOR);
3145                 return 0;
3146         }
3147
3148         if (vmx_get_cpl(vcpu)) {
3149                 kvm_inject_gp(vcpu, 0);
3150                 return 0;
3151         }
3152
3153         return 1;
3154 }
3155
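/*
 * Returns true if APICv has a virtual interrupt pending whose priority
 * (RVI[7:4]) exceeds the current virtual PPR[7:4], i.e. an interrupt that
 * would be delivered if virtual-interrupt delivery were evaluated now.
 */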
3156 static bool vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3157 {
3158         u8 rvi = vmx_get_rvi();
3159         u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3160
3161         return ((rvi & 0xf0) > (vppr & 0xf0));
3162 }
3163
3164 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3165                                    struct vmcs12 *vmcs12);
3166
3167 /*
3168  * If from_vmentry is false, this is being called from state restore (either RSM
3169  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3170  *
3171  * Returns:
3172  *   0 - success, i.e. proceed with actual VMEnter
3173  *   1 - consistency check VMExit
3174  *  -1 - consistency check VMFail
3175  */
3176 int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
3177 {
3178         struct vcpu_vmx *vmx = to_vmx(vcpu);
3179         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3180         bool evaluate_pending_interrupts;
3181         u32 exit_reason = EXIT_REASON_INVALID_STATE;
3182         u32 exit_qual;
3183
3184         evaluate_pending_interrupts = exec_controls_get(vmx) &
3185                 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
3186         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3187                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3188
3189         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3190                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3191         if (kvm_mpx_supported() &&
3192                 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3193                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3194
3195         /*
3196          * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3197          * nested early checks are disabled.  In the event of a "late" VM-Fail,
3198          * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3199          * software model to the pre-VMEntry host state.  When EPT is disabled,
3200          * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3201          * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3202          * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3203          * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3204          * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3205          * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3206          * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3207          * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3208          * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3209          * path would need to manually save/restore vmcs01.GUEST_CR3.
3210          */
3211         if (!enable_ept && !nested_early_check)
3212                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3213
3214         vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3215
3216         prepare_vmcs02_early(vmx, vmcs12);
3217
3218         if (from_vmentry) {
3219                 nested_get_vmcs12_pages(vcpu);
3220
3221                 if (nested_vmx_check_vmentry_hw(vcpu)) {
3222                         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3223                         return -1;
3224                 }
3225
3226                 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3227                         goto vmentry_fail_vmexit;
3228         }
3229
3230         enter_guest_mode(vcpu);
3231         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3232                 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3233
3234         if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3235                 goto vmentry_fail_vmexit_guest_mode;
3236
3237         if (from_vmentry) {
3238                 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3239                 exit_qual = nested_vmx_load_msr(vcpu,
3240                                                 vmcs12->vm_entry_msr_load_addr,
3241                                                 vmcs12->vm_entry_msr_load_count);
3242                 if (exit_qual)
3243                         goto vmentry_fail_vmexit_guest_mode;
3244         } else {
3245                 /*
3246                  * The MMU is not initialized to point at the right entities yet and
3247                  * "get pages" would need to read data from the guest (i.e. we will
3248                  * need to perform gpa to hpa translation). Request a call
3249                  * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3250                  * have already been set at vmentry time and should not be reset.
3251                  */
3252                 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3253         }
3254
3255         /*
3256          * If L1 had a pending IRQ/NMI when it executed
3257          * VMLAUNCH/VMRESUME which wasn't delivered because it was
3258          * disallowed (e.g. interrupts disabled), L0 needs to
3259          * evaluate if this pending event should cause an exit from L2
3260          * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3261          * intercept EXTERNAL_INTERRUPT).
3262          *
3263          * Usually this would be handled by the processor noticing an
3264          * IRQ/NMI window request, or checking RVI during evaluation of
3265          * pending virtual interrupts.  However, this setting was done
3266          * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3267          * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3268          */
3269         if (unlikely(evaluate_pending_interrupts))
3270                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3271
3272         /*
3273          * Do not start the preemption timer hrtimer until after we know
3274          * we are successful, so that only nested_vmx_vmexit needs to cancel
3275          * the timer.
3276          */
3277         vmx->nested.preemption_timer_expired = false;
3278         if (nested_cpu_has_preemption_timer(vmcs12))
3279                 vmx_start_preemption_timer(vcpu);
3280
3281         /*
3282          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3283          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3284          * returned as far as L1 is concerned. It will only return (and set
3285          * the success flag) when L2 exits (see nested_vmx_vmexit()).
3286          */
3287         return 0;
3288
3289         /*
3290          * A failed consistency check that leads to a VMExit during L1's
3291          * VMEnter to L2 is a variation of a normal VMexit, as explained in
3292          * 26.7 "VM-entry failures during or after loading guest state".
3293          */
3294 vmentry_fail_vmexit_guest_mode:
3295         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3296                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3297         leave_guest_mode(vcpu);
3298
3299 vmentry_fail_vmexit:
3300         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3301
3302         if (!from_vmentry)
3303                 return 1;
3304
3305         load_vmcs12_host_state(vcpu, vmcs12);
3306         vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3307         vmcs12->exit_qualification = exit_qual;
3308         if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3309                 vmx->nested.need_vmcs12_to_shadow_sync = true;
3310         return 1;
3311 }
3312
3313 /*
3314  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3315  * for running an L2 nested guest.
3316  */
3317 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3318 {
3319         struct vmcs12 *vmcs12;
3320         struct vcpu_vmx *vmx = to_vmx(vcpu);
3321         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3322         int ret;
3323
3324         if (!nested_vmx_check_permission(vcpu))
3325                 return 1;
3326
3327         if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3328                 return 1;
3329
3330         if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3331                 return nested_vmx_failInvalid(vcpu);
3332
3333         vmcs12 = get_vmcs12(vcpu);
3334
3335         /*
3336          * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3337          * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3338          * rather than RFLAGS.ZF, and no error number is stored to the
3339          * VM-instruction error field.
3340          */
3341         if (vmcs12->hdr.shadow_vmcs)
3342                 return nested_vmx_failInvalid(vcpu);
3343
3344         if (vmx->nested.hv_evmcs) {
3345                 copy_enlightened_to_vmcs12(vmx);
3346                 /* Enlightened VMCS doesn't have launch state */
3347                 vmcs12->launch_state = !launch;
3348         } else if (enable_shadow_vmcs) {
3349                 copy_shadow_to_vmcs12(vmx);
3350         }
3351
3352         /*
3353          * The nested entry process starts with enforcing various prerequisites
3354          * on vmcs12 as required by the Intel SDM, and acting appropriately when
3355          * they fail: As the SDM explains, some conditions should cause the
3356          * instruction to fail, while others will cause the instruction to seem
3357          * to succeed, but return an EXIT_REASON_INVALID_STATE.
3358          * To speed up the normal (success) code path, we should avoid checking
3359          * for misconfigurations which will anyway be caught by the processor
3360          * when using the merged vmcs02.
3361          */
3362         if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3363                 return nested_vmx_failValid(vcpu,
3364                         VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3365
3366         if (vmcs12->launch_state == launch)
3367                 return nested_vmx_failValid(vcpu,
3368                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3369                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3370
3371         if (nested_vmx_check_controls(vcpu, vmcs12))
3372                 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3373
3374         if (nested_vmx_check_host_state(vcpu, vmcs12))
3375                 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3376
3377         /*
3378          * We're finally done with prerequisite checking, and can start with
3379          * the nested entry.
3380          */
3381         vmx->nested.nested_run_pending = 1;
3382         ret = nested_vmx_enter_non_root_mode(vcpu, true);
3383         vmx->nested.nested_run_pending = !ret;
3384         if (ret > 0)
3385                 return 1;
3386         else if (ret)
3387                 return nested_vmx_failValid(vcpu,
3388                         VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3389
3390         /* Hide L1D cache contents from the nested guest.  */
3391         vmx->vcpu.arch.l1tf_flush_l1d = true;
3392
3393         /*
3394          * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3395          * also be used as part of restoring nVMX state for
3396          * snapshot restore (migration).
3397          *
3398          * In this flow, it is assumed that vmcs12 cache was
3399          * transferred as part of captured nVMX state and should
3400          * therefore not be read from guest memory (which may not
3401          * exist on destination host yet).
3402          */
3403         nested_cache_shadow_vmcs12(vcpu, vmcs12);
3404
3405         /*
3406          * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3407          * awakened by event injection or by an NMI-window VM-exit or
3408          * by an interrupt-window VM-exit, halt the vcpu.
3409          */
3410         if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3411             !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3412             !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3413             !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3414               (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3415                 vmx->nested.nested_run_pending = 0;
3416                 return kvm_vcpu_halt(vcpu);
3417         }
3418         return 1;
3419 }
3420
3421 /*
3422  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3423  * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3424  * This function returns the new value we should put in vmcs12.guest_cr0.
3425  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3426  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3427  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3428  *     didn't trap the bit, because if L1 did, so would L0).
3429  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3430  *     been modified by L2, and L1 knows it. So just leave the old value of
3431  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3432  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3433  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3434  *     changed these bits, and therefore they need to be updated, but L0
3435  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3436  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3437  */
3438 static inline unsigned long
3439 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3440 {
3441         return
3442         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3443         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3444         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3445                         vcpu->arch.cr0_guest_owned_bits));
3446 }
3447
3448 static inline unsigned long
3449 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3450 {
3451         return
3452         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3453         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3454         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3455                         vcpu->arch.cr4_guest_owned_bits));
3456 }
3457
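/*
 * Record an event that was injected into L2 but not yet delivered (exception,
 * NMI or interrupt) in vmcs12's IDT-vectoring info fields, so that L1 can
 * re-inject it after the nested VM-Exit.
 */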
3458 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3459                                       struct vmcs12 *vmcs12)
3460 {
3461         u32 idt_vectoring;
3462         unsigned int nr;
3463
3464         if (vcpu->arch.exception.injected) {
3465                 nr = vcpu->arch.exception.nr;
3466                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3467
3468                 if (kvm_exception_is_soft(nr)) {
3469                         vmcs12->vm_exit_instruction_len =
3470                                 vcpu->arch.event_exit_inst_len;
3471                         idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3472                 } else
3473                         idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3474
3475                 if (vcpu->arch.exception.has_error_code) {
3476                         idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3477                         vmcs12->idt_vectoring_error_code =
3478                                 vcpu->arch.exception.error_code;
3479                 }
3480
3481                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3482         } else if (vcpu->arch.nmi_injected) {
3483                 vmcs12->idt_vectoring_info_field =
3484                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3485         } else if (vcpu->arch.interrupt.injected) {
3486                 nr = vcpu->arch.interrupt.nr;
3487                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3488
3489                 if (vcpu->arch.interrupt.soft) {
3490                         idt_vectoring |= INTR_TYPE_SOFT_INTR;
3491                         vmcs12->vm_entry_instruction_len =
3492                                 vcpu->arch.event_exit_inst_len;
3493                 } else
3494                         idt_vectoring |= INTR_TYPE_EXT_INTR;
3495
3496                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3497         }
3498 }
3499
3500
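/*
 * Pages the CPU may have written while running L2 (the virtual-APIC page and
 * the posted-interrupt descriptor) must be marked dirty so that dirty logging
 * and live migration see the updates.
 */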
3501 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3502 {
3503         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3504         gfn_t gfn;
3505
3506         /*
3507          * Don't need to mark the APIC access page dirty; it is never
3508          * written to by the CPU during APIC virtualization.
3509          */
3510
3511         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3512                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3513                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3514         }
3515
3516         if (nested_cpu_has_posted_intr(vmcs12)) {
3517                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3518                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3519         }
3520 }
3521
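/*
 * Emulate delivery of a nested posted interrupt: if the PI descriptor's ON
 * bit is set, merge its PIR into L2's virtual-APIC page and raise RVI via
 * GUEST_INTR_STATUS so the pending vector is evaluated on the next VM-Enter.
 */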
3522 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3523 {
3524         struct vcpu_vmx *vmx = to_vmx(vcpu);
3525         int max_irr;
3526         void *vapic_page;
3527         u16 status;
3528
3529         if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3530                 return;
3531
3532         vmx->nested.pi_pending = false;
3533         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3534                 return;
3535
3536         max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3537         if (max_irr != 256) {
3538                 vapic_page = vmx->nested.virtual_apic_map.hva;
3539                 if (!vapic_page)
3540                         return;
3541
3542                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3543                         vapic_page, &max_irr);
3544                 status = vmcs_read16(GUEST_INTR_STATUS);
3545                 if ((u8)max_irr > ((u8)status & 0xff)) {
3546                         status &= ~0xff;
3547                         status |= (u8)max_irr;
3548                         vmcs_write16(GUEST_INTR_STATUS, status);
3549                 }
3550         }
3551
3552         nested_mark_vmcs12_pages_dirty(vcpu);
3553 }
3554
3555 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3556                                                unsigned long exit_qual)
3557 {
3558         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3559         unsigned int nr = vcpu->arch.exception.nr;
3560         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3561
3562         if (vcpu->arch.exception.has_error_code) {
3563                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3564                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3565         }
3566
3567         if (kvm_exception_is_soft(nr))
3568                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3569         else
3570                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3571
3572         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3573             vmx_get_nmi_mask(vcpu))
3574                 intr_info |= INTR_INFO_UNBLOCK_NMI;
3575
3576         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3577 }
3578
3579 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3580 {
3581         struct vcpu_vmx *vmx = to_vmx(vcpu);
3582         unsigned long exit_qual;
3583         bool block_nested_events =
3584             vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3585         struct kvm_lapic *apic = vcpu->arch.apic;
3586
3587         if (lapic_in_kernel(vcpu) &&
3588                 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3589                 if (block_nested_events)
3590                         return -EBUSY;
3591                 clear_bit(KVM_APIC_INIT, &apic->pending_events);
3592                 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3593                 return 0;
3594         }
3595
3596         if (vcpu->arch.exception.pending &&
3597                 nested_vmx_check_exception(vcpu, &exit_qual)) {
3598                 if (block_nested_events)
3599                         return -EBUSY;
3600                 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3601                 return 0;
3602         }
3603
3604         if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3605             vmx->nested.preemption_timer_expired) {
3606                 if (block_nested_events)
3607                         return -EBUSY;
3608                 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3609                 return 0;
3610         }
3611
3612         if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3613                 if (block_nested_events)
3614                         return -EBUSY;
3615                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3616                                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
3617                                   INTR_INFO_VALID_MASK, 0);
3618                 /*
3619                  * The NMI-triggered VM exit counts as injection:
3620                  * clear this one and block further NMIs.
3621                  */
3622                 vcpu->arch.nmi_pending = 0;
3623                 vmx_set_nmi_mask(vcpu, true);
3624                 return 0;
3625         }
3626
3627         if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3628             nested_exit_on_intr(vcpu)) {
3629                 if (block_nested_events)
3630                         return -EBUSY;
3631                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3632                 return 0;
3633         }
3634
3635         vmx_complete_nested_posted_interrupt(vcpu);
3636         return 0;
3637 }
3638
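/*
 * Convert the time remaining on the emulated preemption-timer hrtimer back
 * into VMX-preemption-timer units: nanoseconds are scaled to guest TSC ticks
 * and then shifted right by the emulated timer rate.
 */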
3639 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3640 {
3641         ktime_t remaining =
3642                 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3643         u64 value;
3644
3645         if (ktime_to_ns(remaining) <= 0)
3646                 return 0;
3647
3648         value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3649         do_div(value, 1000000);
3650         return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3651 }
3652
3653 static bool is_vmcs12_ext_field(unsigned long field)
3654 {
3655         switch (field) {
3656         case GUEST_ES_SELECTOR:
3657         case GUEST_CS_SELECTOR:
3658         case GUEST_SS_SELECTOR:
3659         case GUEST_DS_SELECTOR:
3660         case GUEST_FS_SELECTOR:
3661         case GUEST_GS_SELECTOR:
3662         case GUEST_LDTR_SELECTOR:
3663         case GUEST_TR_SELECTOR:
3664         case GUEST_ES_LIMIT:
3665         case GUEST_CS_LIMIT:
3666         case GUEST_SS_LIMIT:
3667         case GUEST_DS_LIMIT:
3668         case GUEST_FS_LIMIT:
3669         case GUEST_GS_LIMIT:
3670         case GUEST_LDTR_LIMIT:
3671         case GUEST_TR_LIMIT:
3672         case GUEST_GDTR_LIMIT:
3673         case GUEST_IDTR_LIMIT:
3674         case GUEST_ES_AR_BYTES:
3675         case GUEST_DS_AR_BYTES:
3676         case GUEST_FS_AR_BYTES:
3677         case GUEST_GS_AR_BYTES:
3678         case GUEST_LDTR_AR_BYTES:
3679         case GUEST_TR_AR_BYTES:
3680         case GUEST_ES_BASE:
3681         case GUEST_CS_BASE:
3682         case GUEST_SS_BASE:
3683         case GUEST_DS_BASE:
3684         case GUEST_FS_BASE:
3685         case GUEST_GS_BASE:
3686         case GUEST_LDTR_BASE:
3687         case GUEST_TR_BASE:
3688         case GUEST_GDTR_BASE:
3689         case GUEST_IDTR_BASE:
3690         case GUEST_PENDING_DBG_EXCEPTIONS:
3691         case GUEST_BNDCFGS:
3692                 return true;
3693         default:
3694                 break;
3695         }
3696
3697         return false;
3698 }
3699
3700 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3701                                        struct vmcs12 *vmcs12)
3702 {
3703         struct vcpu_vmx *vmx = to_vmx(vcpu);
3704
3705         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3706         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3707         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3708         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3709         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3710         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3711         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3712         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3713         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3714         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3715         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3716         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3717         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3718         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3719         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3720         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3721         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3722         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3723         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3724         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3725         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3726         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3727         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3728         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3729         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3730         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3731         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3732         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3733         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3734         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3735         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3736         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3737         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3738         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3739         vmcs12->guest_pending_dbg_exceptions =
3740                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3741         if (kvm_mpx_supported())
3742                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3743
3744         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3745 }
3746
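/*
 * The "rare" fields are only read back from vmcs02 on demand; doing so
 * requires temporarily loading vmcs02 on this CPU, so the sync is skipped
 * entirely when nothing has flagged those fields as stale.
 */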
3747 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3748                                        struct vmcs12 *vmcs12)
3749 {
3750         struct vcpu_vmx *vmx = to_vmx(vcpu);
3751         int cpu;
3752
3753         if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3754                 return;
3755
3757         WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3758
3759         cpu = get_cpu();
3760         vmx->loaded_vmcs = &vmx->nested.vmcs02;
3761         vmx_vcpu_load(&vmx->vcpu, cpu);
3762
3763         sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3764
3765         vmx->loaded_vmcs = &vmx->vmcs01;
3766         vmx_vcpu_load(&vmx->vcpu, cpu);
3767         put_cpu();
3768 }
3769
3770 /*
3771  * Update the guest state fields of vmcs12 to reflect changes that
3772  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3773  * VM-entry controls is also updated, since this is really a guest
3774  * state bit.)
3775  */
3776 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3777 {
3778         struct vcpu_vmx *vmx = to_vmx(vcpu);
3779
3780         if (vmx->nested.hv_evmcs)
3781                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3782
3783         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3784
3785         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3786         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3787
3788         vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3789         vmcs12->guest_rip = kvm_rip_read(vcpu);
3790         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3791
3792         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3793         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3794
3795         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3796         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3797         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3798
3799         vmcs12->guest_interruptibility_info =
3800                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3801
3802         if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3803                 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3804         else
3805                 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3806
3807         if (nested_cpu_has_preemption_timer(vmcs12) &&
3808             vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3809                 vmcs12->vmx_preemption_timer_value =
3810                         vmx_get_preemption_timer_value(vcpu);
3811
3812         /*
3813          * In some cases (usually, nested EPT), L2 is allowed to change its
3814          * own CR3 without exiting. If it has changed it, we must keep it.
3815          * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3816          * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3817          *
3818          * Additionally, restore L2's PDPTR to vmcs12.
3819          */
3820         if (enable_ept) {
3821                 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3822                 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3823                         vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3824                         vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3825                         vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3826                         vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3827                 }
3828         }
3829
3830         vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3831
3832         if (nested_cpu_has_vid(vmcs12))
3833                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3834
3835         vmcs12->vm_entry_controls =
3836                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3837                 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3838
3839         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3840                 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3841
3842         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3843                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
3844 }
3845
3846 /*
3847  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3848  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3849  * and this function updates it to reflect the changes to the guest state while
3850  * L2 was running (and perhaps made some exits which were handled directly by L0
3851  * without going back to L1), and to reflect the exit reason.
3852  * Note that we do not have to copy here all VMCS fields, just those that
3853  * could have been changed by the L2 guest or the exit - i.e., the guest-state and
3854  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3855  * which already writes to vmcs12 directly.
3856  */
3857 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3858                            u32 exit_reason, u32 exit_intr_info,
3859                            unsigned long exit_qualification)
3860 {
3861         /* update exit information fields: */
3862         vmcs12->vm_exit_reason = exit_reason;
3863         vmcs12->exit_qualification = exit_qualification;
3864         vmcs12->vm_exit_intr_info = exit_intr_info;
3865
3866         vmcs12->idt_vectoring_info_field = 0;
3867         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3868         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3869
3870         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3871                 vmcs12->launch_state = 1;
3872
3873                 /* vm_entry_intr_info_field is cleared on exit. Emulate this
3874                  * instead of reading the real value. */
3875                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3876
3877                 /*
3878                  * Transfer the event that L0 or L1 may have wanted to inject into
3879                  * L2 to IDT_VECTORING_INFO_FIELD.
3880                  */
3881                 vmcs12_save_pending_event(vcpu, vmcs12);
3882
3883                 /*
3884                  * According to the spec, there's no need to store the guest's
3885                  * MSRs if the exit is due to a VM-entry failure that occurs
3886                  * during or after loading the guest state. Since this exit
3887                  * does not fall in that category, we need to save the MSRs.
3888                  */
3889                 if (nested_vmx_store_msr(vcpu,
3890                                          vmcs12->vm_exit_msr_store_addr,
3891                                          vmcs12->vm_exit_msr_store_count))
3892                         nested_vmx_abort(vcpu,
3893                                          VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3894         }
3895
3896         /*
3897          * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3898          * preserved above and would only end up incorrectly in L1.
3899          */
3900         vcpu->arch.nmi_injected = false;
3901         kvm_clear_exception_queue(vcpu);
3902         kvm_clear_interrupt_queue(vcpu);
3903 }
3904
3905 /*
3906  * A part of what we need to do when the nested L2 guest exits and we want to
3907  * run its L1 parent, is to reset L1's guest state to the host state specified
3908  * in vmcs12.
3909  * This function is to be called not only on normal nested exit, but also on
3910  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3911  * Failures During or After Loading Guest State").
3912  * This function should be called when the active VMCS is L1's (vmcs01).
3913  */
3914 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3915                                    struct vmcs12 *vmcs12)
3916 {
3917         struct kvm_segment seg;
3918         u32 entry_failure_code;
3919
3920         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3921                 vcpu->arch.efer = vmcs12->host_ia32_efer;
3922         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3923                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3924         else
3925                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3926         vmx_set_efer(vcpu, vcpu->arch.efer);
3927
3928         kvm_rsp_write(vcpu, vmcs12->host_rsp);
3929         kvm_rip_write(vcpu, vmcs12->host_rip);
3930         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3931         vmx_set_interrupt_shadow(vcpu, 0);
3932
3933         /*
3934          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3935          * actually changed, because vmx_set_cr0 refers to efer set above.
3936          *
3937          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3938          * (KVM doesn't change it).
3939          */
3940         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3941         vmx_set_cr0(vcpu, vmcs12->host_cr0);
3942
3943         /* Same as above - no reason to call set_cr4_guest_host_mask().  */
3944         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3945         vmx_set_cr4(vcpu, vmcs12->host_cr4);
3946
3947         nested_ept_uninit_mmu_context(vcpu);
3948
3949         /*
3950          * Only PDPTE load can fail as the value of cr3 was checked on entry and
3951          * couldn't have changed.
3952          */
3953         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3954                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3955
3956         if (!enable_ept)
3957                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3958
3959         /*
3960          * If vmcs01 doesn't use VPID, the CPU flushes the TLB on every
3961          * VMEntry/VMExit, so there is no need to flush it here.
3962          *
3963          * If vmcs12 doesn't use VPID, L1 expects the TLB to be
3964          * flushed on every VMEntry/VMExit.
3965          *
3966          * Otherwise, we can preserve TLB entries as long as we are
3967          * able to tag L1 TLB entries differently than L2 TLB entries.
3968          *
3969          * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3970          * and therefore we request the TLB flush to happen only after VMCS EPTP
3971          * has been set by KVM_REQ_LOAD_CR3.
3972          */
3973         if (enable_vpid &&
3974             (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3975                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3976         }
3977
3978         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3979         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3980         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3981         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3982         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3983         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3984         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3985
3986         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
3987         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3988                 vmcs_write64(GUEST_BNDCFGS, 0);
3989
3990         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3991                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3992                 vcpu->arch.pat = vmcs12->host_ia32_pat;
3993         }
3994         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
3995                 SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
3996                                 vmcs12->host_ia32_perf_global_ctrl);
3997
3998         /* Set L1 segment info according to Intel SDM
3999          * 27.5.2 Loading Host Segment and Descriptor-Table Registers. */
4000         seg = (struct kvm_segment) {
4001                 .base = 0,
4002                 .limit = 0xFFFFFFFF,
4003                 .selector = vmcs12->host_cs_selector,
4004                 .type = 11,
4005                 .present = 1,
4006                 .s = 1,
4007                 .g = 1
4008         };
4009         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4010                 seg.l = 1;
4011         else
4012                 seg.db = 1;
4013         vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4014         seg = (struct kvm_segment) {
4015                 .base = 0,
4016                 .limit = 0xFFFFFFFF,
4017                 .type = 3,
4018                 .present = 1,
4019                 .s = 1,
4020                 .db = 1,
4021                 .g = 1
4022         };
4023         seg.selector = vmcs12->host_ds_selector;
4024         vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4025         seg.selector = vmcs12->host_es_selector;
4026         vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4027         seg.selector = vmcs12->host_ss_selector;
4028         vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4029         seg.selector = vmcs12->host_fs_selector;
4030         seg.base = vmcs12->host_fs_base;
4031         vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4032         seg.selector = vmcs12->host_gs_selector;
4033         seg.base = vmcs12->host_gs_base;
4034         vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4035         seg = (struct kvm_segment) {
4036                 .base = vmcs12->host_tr_base,
4037                 .limit = 0x67,
4038                 .selector = vmcs12->host_tr_selector,
4039                 .type = 11,
4040                 .present = 1
4041         };
4042         vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4043
4044         kvm_set_dr(vcpu, 7, 0x400);
4045         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4046
4047         if (cpu_has_vmx_msr_bitmap())
4048                 vmx_update_msr_bitmap(vcpu);
4049
4050         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4051                                 vmcs12->vm_exit_msr_load_count))
4052                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4053 }
4054
4055 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4056 {
4057         struct shared_msr_entry *efer_msr;
4058         unsigned int i;
4059
4060         if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4061                 return vmcs_read64(GUEST_IA32_EFER);
4062
4063         if (cpu_has_load_ia32_efer())
4064                 return host_efer;
4065
4066         for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4067                 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4068                         return vmx->msr_autoload.guest.val[i].value;
4069         }
4070
4071         efer_msr = find_msr_entry(vmx, MSR_EFER);
4072         if (efer_msr)
4073                 return efer_msr->data;
4074
4075         return host_efer;
4076 }
4077
4078 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4079 {
4080         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4081         struct vcpu_vmx *vmx = to_vmx(vcpu);
4082         struct vmx_msr_entry g, h;
4083         gpa_t gpa;
4084         u32 i, j;
4085
4086         vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4087
4088         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4089                 /*
4090                  * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4091                  * as vmcs01.GUEST_DR7 contains a userspace defined value
4092                  * and vcpu->arch.dr7 is not squirreled away before the
4093                  * nested VMENTER (not worth adding a variable in nested_vmx).
4094                  */
4095                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4096                         kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4097                 else
4098                         WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4099         }
4100
4101         /*
4102          * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4103          * handle a variety of side effects to KVM's software model.
4104          */
4105         vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4106
4107         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
4108         vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4109
4110         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4111         vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4112
4113         nested_ept_uninit_mmu_context(vcpu);
4114         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4115         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4116
4117         /*
4118          * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4119          * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4120          * VMFail; like everything else, we just need to ensure our
4121          * software model is up-to-date.
4122          */
4123         if (enable_ept)
4124                 ept_save_pdptrs(vcpu);
4125
4126         kvm_mmu_reset_context(vcpu);
4127
4128         if (cpu_has_vmx_msr_bitmap())
4129                 vmx_update_msr_bitmap(vcpu);
4130
4131         /*
4132          * This nasty bit of open coding is a compromise between blindly
4133          * loading L1's MSRs using the exit load lists (incorrect emulation
4134          * of VMFail), leaving the nested VM's MSRs in the software model
4135          * (incorrect behavior) and snapshotting the modified MSRs (too
4136          * expensive since the lists are unbounded by hardware).  For each
4137          * MSR that was (prematurely) loaded from the nested VMEntry load
4138          * list, reload it from the exit load list if it exists and differs
4139          * from the guest value.  The intent is to stuff host state as
4140          * silently as possible, not to fully process the exit load list.
4141          */
4142         for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4143                 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4144                 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4145                         pr_debug_ratelimited(
4146                                 "%s read MSR index failed (%u, 0x%08llx)\n",
4147                                 __func__, i, gpa);
4148                         goto vmabort;
4149                 }
4150
4151                 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4152                         gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4153                         if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4154                                 pr_debug_ratelimited(
4155                                         "%s read MSR failed (%u, 0x%08llx)\n",
4156                                         __func__, j, gpa);
4157                                 goto vmabort;
4158                         }
4159                         if (h.index != g.index)
4160                                 continue;
4161                         if (h.value == g.value)
4162                                 break;
4163
4164                         if (nested_vmx_load_msr_check(vcpu, &h)) {
4165                                 pr_debug_ratelimited(
4166                                         "%s check failed (%u, 0x%x, 0x%x)\n",
4167                                         __func__, j, h.index, h.reserved);
4168                                 goto vmabort;
4169                         }
4170
4171                         if (kvm_set_msr(vcpu, h.index, h.value)) {
4172                                 pr_debug_ratelimited(
4173                                         "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4174                                         __func__, j, h.index, h.value);
4175                                 goto vmabort;
4176                         }
4177                 }
4178         }
4179
4180         return;
4181
4182 vmabort:
4183         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4184 }
4185
4186 /*
4187  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4188  * and modify vmcs12 to make it see what it would expect to see there if
4189  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4190  */
4191 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4192                        u32 exit_intr_info, unsigned long exit_qualification)
4193 {
4194         struct vcpu_vmx *vmx = to_vmx(vcpu);
4195         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4196
4197         /* trying to cancel vmlaunch/vmresume is a bug */
4198         WARN_ON_ONCE(vmx->nested.nested_run_pending);
4199
4200         leave_guest_mode(vcpu);
4201
4202         if (nested_cpu_has_preemption_timer(vmcs12))
4203                 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4204
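             /*
              * Undo the TSC offset that L1 requested for L2 at nested VM-entry
              * so that vcpu->arch.tsc_offset once again reflects L1's view.
              */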
4205         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
4206                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4207
4208         if (likely(!vmx->fail)) {
4209                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4210
4211                 if (exit_reason != -1)
4212                         prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4213                                        exit_qualification);
4214
4215                 /*
4216                  * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4217                  * also be used to capture vmcs12 cache as part of
4218                  * capturing nVMX state for snapshot (migration).
4219                  *
4220                  * Otherwise, this flush will dirty guest memory at a
4221                  * point it is already assumed by user-space to be
4222                  * immutable.
4223                  */
4224                 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4225         } else {
4226                 /*
4227                  * The only expected VM-instruction error is "VM entry with
4228                  * invalid control field(s)." Anything else indicates a
4229                  * problem with L0.  And we should never get here with a
4230                  * VMFail of any type if early consistency checks are enabled.
4231                  */
4232                 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4233                              VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4234                 WARN_ON_ONCE(nested_early_check);
4235         }
4236
4237         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4238
4239         /* Update any VMCS fields that might have changed while L2 ran */
4240         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4241         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4242         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
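             /*
              * A TPR threshold update for L1 that arrived while vmcs02 was
              * loaded was stashed in l1_tpr_threshold; apply it to vmcs01 now.
              */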
4243         if (vmx->nested.l1_tpr_threshold != -1)
4244                 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4245
4246         if (kvm_has_tsc_control)
4247                 decache_tsc_multiplier(vmx);
4248
4249         if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4250                 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4251                 vmx_set_virtual_apic_mode(vcpu);
4252         } else if (!nested_cpu_has_ept(vmcs12) &&
4253                    nested_cpu_has2(vmcs12,
4254                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
4255                 vmx_flush_tlb(vcpu, true);
4256         }
4257
4258         /* Unpin physical memory we referred to in vmcs02 */
4259         if (vmx->nested.apic_access_page) {
4260                 kvm_release_page_dirty(vmx->nested.apic_access_page);
4261                 vmx->nested.apic_access_page = NULL;
4262         }
4263         kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4264         kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4265         vmx->nested.pi_desc = NULL;
4266
4267         /*
4268          * While L2 was running, an mmu_notifier may have forced a reload of
4269          * the page's hpa for the L2 vmcs.  Reload it for L1 before entering L1.
4270          */
4271         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4272
4273         if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4274                 vmx->nested.need_vmcs12_to_shadow_sync = true;
4275
4276         /* in case we halted in L2 */
4277         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4278
4279         if (likely(!vmx->fail)) {
4280                 /*
4281                  * TODO: SDM says that with acknowledge interrupt on
4282                  * exit, bit 31 of the VM-exit interrupt information
4283                  * (valid interrupt) is always set to 1 on
4284                  * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4285                  * need kvm_cpu_has_interrupt().  See the commit
4286                  * message for details.
4287                  */
4288                 if (nested_exit_intr_ack_set(vcpu) &&
4289                     exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4290                     kvm_cpu_has_interrupt(vcpu)) {
4291                         int irq = kvm_cpu_get_interrupt(vcpu);
4292                         WARN_ON(irq < 0);
4293                         vmcs12->vm_exit_intr_info = irq |
4294                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4295                 }
4296
4297                 if (exit_reason != -1)
4298                         trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4299                                                        vmcs12->exit_qualification,
4300                                                        vmcs12->idt_vectoring_info_field,
4301                                                        vmcs12->vm_exit_intr_info,
4302                                                        vmcs12->vm_exit_intr_error_code,
4303                                                        KVM_ISA_VMX);
4304
4305                 load_vmcs12_host_state(vcpu, vmcs12);
4306
4307                 return;
4308         }
4309
4310         /*
4311          * After an early L2 VM-entry failure, we're now back
4312          * in L1 which thinks it just finished a VMLAUNCH or
4313          * VMRESUME instruction, so we need to set the failure
4314          * flag and the VM-instruction error field of the VMCS
4315          * accordingly, and skip the emulated instruction.
4316          */
4317         (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4318
4319         /*
4320          * Restore L1's host state to KVM's software model.  We're here
4321          * because a consistency check was caught by hardware, which
4322          * means some amount of guest state has been propagated to KVM's
4323          * model and needs to be unwound to the host's state.
4324          */
4325         nested_vmx_restore_host_state(vcpu);
4326
4327         vmx->fail = 0;
4328 }
4329
4330 /*
4331  * Decode the memory-address operand of a vmx instruction, as recorded on an
4332  * exit caused by such an instruction (run by a guest hypervisor).
4333  * On success, returns 0. When the operand is invalid, returns 1 and throws
4334  * #UD or #GP.
4335  */
4336 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4337                         u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4338 {
4339         gva_t off;
4340         bool exn;
4341         struct kvm_segment s;
4342
4343         /*
4344          * According to Vol. 3B, "Information for VM Exits Due to Instruction
4345          * Execution", on an exit, vmx_instruction_info holds most of the
4346          * addressing components of the operand. Only the displacement part
4347          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4348          * For how an actual address is calculated from all these components,
4349          * refer to Vol. 1, "Operand Addressing".
4350          */
4351         int  scaling = vmx_instruction_info & 3;
4352         int  addr_size = (vmx_instruction_info >> 7) & 7;
4353         bool is_reg = vmx_instruction_info & (1u << 10);
4354         int  seg_reg = (vmx_instruction_info >> 15) & 7;
4355         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4356         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4357         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4358         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4359
4360         if (is_reg) {
4361                 kvm_queue_exception(vcpu, UD_VECTOR);
4362                 return 1;
4363         }
4364
4365         /* Addr = segment_base + offset */
4366         /* offset = base + [index * scale] + displacement */
4367         off = exit_qualification; /* holds the displacement */
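             /*
              * Bits 9:7 of the instruction info encode the address size
              * (0 = 16-bit, 1 = 32-bit, 2 = 64-bit); sign-extend the
              * displacement accordingly.
              */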
4368         if (addr_size == 1)
4369                 off = (gva_t)sign_extend64(off, 31);
4370         else if (addr_size == 0)
4371                 off = (gva_t)sign_extend64(off, 15);
4372         if (base_is_valid)
4373                 off += kvm_register_read(vcpu, base_reg);
4374         if (index_is_valid)
4375                 off += kvm_register_read(vcpu, index_reg) << scaling;
4376         vmx_get_segment(vcpu, &s, seg_reg);
4377
4378         /*
4379          * The effective address, i.e. @off, of a memory operand is truncated
4380          * based on the address size of the instruction.  Note that this is
4381          * the *effective address*, i.e. the address prior to accounting for
4382          * the segment's base.
4383          */
4384         if (addr_size == 1) /* 32 bit */
4385                 off &= 0xffffffff;
4386         else if (addr_size == 0) /* 16 bit */
4387                 off &= 0xffff;
4388
4389         /* Checks for #GP/#SS exceptions. */
4390         exn = false;
4391         if (is_long_mode(vcpu)) {
4392                 /*
4393                  * The virtual/linear address is never truncated in 64-bit
4394                  * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4395                  * address when using FS/GS with a non-zero base.
4396                  */
4397                 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4398                         *ret = s.base + off;
4399                 else
4400                         *ret = off;
4401
4402                 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4403                  * non-canonical form. This is the only check on the memory
4404                  * destination for long mode!
4405                  */
4406                 exn = is_noncanonical_address(*ret, vcpu);
4407         } else {
4408                 /*
4409                  * When not in long mode, the virtual/linear address is
4410                  * unconditionally truncated to 32 bits regardless of the
4411                  * address size.
4412                  */
4413                 *ret = (s.base + off) & 0xffffffff;
4414
4415                 /* Protected mode: apply checks for segment validity in the
4416                  * following order:
4417                  * - segment type check (#GP(0) may be thrown)
4418                  * - usability check (#GP(0)/#SS(0))
4419                  * - limit check (#GP(0)/#SS(0))
4420                  */
4421                 if (wr)
4422                         /* #GP(0) if the destination operand is located in a
4423                          * read-only data segment or any code segment.
4424                          */
4425                         exn = ((s.type & 0xa) == 0 || (s.type & 8));
4426                 else
4427                         /* #GP(0) if the source operand is located in an
4428                          * execute-only code segment
4429                          */
4430                         exn = ((s.type & 0xa) == 8);
4431                 if (exn) {
4432                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4433                         return 1;
4434                 }
4435                 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4436                  */
4437                 exn = (s.unusable != 0);
4438
4439                 /*
4440                  * Protected mode: #GP(0)/#SS(0) if the memory operand is
4441                  * outside the segment limit.  All CPUs that support VMX ignore
4442                  * limit checks for flat segments, i.e. segments with base==0,
4443                  * limit==0xffffffff and of type expand-up data or code.
4444                  */
4445                 if (!(s.base == 0 && s.limit == 0xffffffff &&
4446                      ((s.type & 8) || !(s.type & 4))))
4447                         exn = exn || ((u64)off + len - 1 > s.limit);
4448         }
4449         if (exn) {
4450                 kvm_queue_exception_e(vcpu,
4451                                       seg_reg == VCPU_SREG_SS ?
4452                                                 SS_VECTOR : GP_VECTOR,
4453                                       0);
4454                 return 1;
4455         }
4456
4457         return 0;
4458 }
4459
4460 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4461 {
4462         struct vcpu_vmx *vmx;
4463
4464         if (!nested_vmx_allowed(vcpu))
4465                 return;
4466
4467         vmx = to_vmx(vcpu);
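             /*
              * Advertise the "load IA32_PERF_GLOBAL_CTRL" VM-entry/VM-exit
              * controls to L1 only when the vPMU actually exposes
              * MSR_CORE_PERF_GLOBAL_CTRL.
              */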
4468         if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4469                 vmx->nested.msrs.entry_ctls_high |=
4470                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4471                 vmx->nested.msrs.exit_ctls_high |=
4472                                 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4473         } else {
4474                 vmx->nested.msrs.entry_ctls_high &=
4475                                 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4476                 vmx->nested.msrs.exit_ctls_high &=
4477                                 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4478         }
4479 }
4480
4481 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4482 {
4483         gva_t gva;
4484         struct x86_exception e;
4485
4486         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4487                                 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4488                                 sizeof(*vmpointer), &gva))
4489                 return 1;
4490
4491         if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4492                 kvm_inject_page_fault(vcpu, &e);
4493                 return 1;
4494         }
4495
4496         return 0;
4497 }
4498
4499 /*
4500  * Allocate a shadow VMCS and associate it with the currently loaded
4501  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4502  * VMCS is also VMCLEARed, so that it is ready for use.
4503  */
4504 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4505 {
4506         struct vcpu_vmx *vmx = to_vmx(vcpu);
4507         struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4508
4509         /*
4510          * We should allocate a shadow vmcs for vmcs01 only when L1
4511          * executes VMXON and free it when L1 executes VMXOFF.
4512          * As it is invalid to execute VMXON twice, we shouldn't reach
4513          * here when vmcs01 already has an allocated shadow vmcs.
4514          */
4515         WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4516
4517         if (!loaded_vmcs->shadow_vmcs) {
4518                 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4519                 if (loaded_vmcs->shadow_vmcs)
4520                         vmcs_clear(loaded_vmcs->shadow_vmcs);
4521         }
4522         return loaded_vmcs->shadow_vmcs;
4523 }
4524
4525 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4526 {
4527         struct vcpu_vmx *vmx = to_vmx(vcpu);
4528         int r;
4529
4530         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4531         if (r < 0)
4532                 goto out_vmcs02;
4533
4534         vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4535         if (!vmx->nested.cached_vmcs12)
4536                 goto out_cached_vmcs12;
4537
4538         vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4539         if (!vmx->nested.cached_shadow_vmcs12)
4540                 goto out_cached_shadow_vmcs12;
4541
4542         if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4543                 goto out_shadow_vmcs;
4544
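             /*
              * The VMX-preemption timer exposed to L1 is emulated with a host
              * hrtimer, pinned so that it fires on the CPU running the vCPU.
              */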
4545         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4546                      HRTIMER_MODE_REL_PINNED);
4547         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4548
4549         vmx->nested.vpid02 = allocate_vpid();
4550
4551         vmx->nested.vmcs02_initialized = false;
4552         vmx->nested.vmxon = true;
4553
4554         if (pt_mode == PT_MODE_HOST_GUEST) {
4555                 vmx->pt_desc.guest.ctl = 0;
4556                 pt_update_intercept_for_msr(vmx);
4557         }
4558
4559         return 0;
4560
4561 out_shadow_vmcs:
4562         kfree(vmx->nested.cached_shadow_vmcs12);
4563
4564 out_cached_shadow_vmcs12:
4565         kfree(vmx->nested.cached_vmcs12);
4566
4567 out_cached_vmcs12:
4568         free_loaded_vmcs(&vmx->nested.vmcs02);
4569
4570 out_vmcs02:
4571         return -ENOMEM;
4572 }
4573
4574 /*
4575  * Emulate the VMXON instruction.
4576  * Currently, we just remember that VMX is active, and do not save or even
4577  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4578  * do not currently need to store anything in that guest-allocated memory
4579  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4580  * argument is different from the VMXON pointer (which the spec says they do).
4581  */
4582 static int handle_vmon(struct kvm_vcpu *vcpu)
4583 {
4584         int ret;
4585         gpa_t vmptr;
4586         uint32_t revision;
4587         struct vcpu_vmx *vmx = to_vmx(vcpu);
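             /*
              * VMXON requires IA32_FEATURE_CONTROL to be locked with VMX
              * enabled outside SMX operation.
              */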
4588         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4589                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4590
4591         /*
4592          * The Intel VMX Instruction Reference lists a bunch of bits that are
4593          * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4594          * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4595          * Otherwise, we should fail with #UD.  But most faulting conditions
4596          * have already been checked by hardware, prior to the VM-exit for
4597          * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4598          * that bit set to 1 in non-root mode.
4599          */
4600         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4601                 kvm_queue_exception(vcpu, UD_VECTOR);
4602                 return 1;
4603         }
4604
4605         /* CPL=0 must be checked manually. */
4606         if (vmx_get_cpl(vcpu)) {
4607                 kvm_inject_gp(vcpu, 0);
4608                 return 1;
4609         }
4610
4611         if (vmx->nested.vmxon)
4612                 return nested_vmx_failValid(vcpu,
4613                         VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4614
4615         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4616                         != VMXON_NEEDED_FEATURES) {
4617                 kvm_inject_gp(vcpu, 0);
4618                 return 1;
4619         }
4620
4621         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4622                 return 1;
4623
4624         /*
4625          * SDM 3: 24.11.5
4626          * The first 4 bytes of the VMXON region contain the supported
4627          * VMCS revision identifier.
4628          *
4629          * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
4630          * which would replace the physical address width with 32.
4631          */
4632         if (!page_address_valid(vcpu, vmptr))
4633                 return nested_vmx_failInvalid(vcpu);
4634
4635         if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4636             revision != VMCS12_REVISION)
4637                 return nested_vmx_failInvalid(vcpu);
4638
4639         vmx->nested.vmxon_ptr = vmptr;
4640         ret = enter_vmx_operation(vcpu);
4641         if (ret)
4642                 return ret;
4643
4644         return nested_vmx_succeed(vcpu);
4645 }
4646
4647 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4648 {
4649         struct vcpu_vmx *vmx = to_vmx(vcpu);
4650
4651         if (vmx->nested.current_vmptr == -1ull)
4652                 return;
4653
4654         copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4655
4656         if (enable_shadow_vmcs) {
4657                 /* copy to memory all shadowed fields in case
4658                    they were modified */
4659                 copy_shadow_to_vmcs12(vmx);
4660                 vmx_disable_shadow_vmcs(vmx);
4661         }
4662         vmx->nested.posted_intr_nv = -1;
4663
4664         /* Flush VMCS12 to guest memory */
4665         kvm_vcpu_write_guest_page(vcpu,
4666                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
4667                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4668
4669         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4670
4671         vmx->nested.current_vmptr = -1ull;
4672 }
4673
4674 /* Emulate the VMXOFF instruction */
4675 static int handle_vmoff(struct kvm_vcpu *vcpu)
4676 {
4677         if (!nested_vmx_check_permission(vcpu))
4678                 return 1;
4679
4680         free_nested(vcpu);
4681
4682         /* Process an INIT latched while the CPU was in VMX operation */
4683         kvm_make_request(KVM_REQ_EVENT, vcpu);
4684
4685         return nested_vmx_succeed(vcpu);
4686 }
4687
4688 /* Emulate the VMCLEAR instruction */
4689 static int handle_vmclear(struct kvm_vcpu *vcpu)
4690 {
4691         struct vcpu_vmx *vmx = to_vmx(vcpu);
4692         u32 zero = 0;
4693         gpa_t vmptr;
4694         u64 evmcs_gpa;
4695
4696         if (!nested_vmx_check_permission(vcpu))
4697                 return 1;
4698
4699         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4700                 return 1;
4701
4702         if (!page_address_valid(vcpu, vmptr))
4703                 return nested_vmx_failValid(vcpu,
4704                         VMXERR_VMCLEAR_INVALID_ADDRESS);
4705
4706         if (vmptr == vmx->nested.vmxon_ptr)
4707                 return nested_vmx_failValid(vcpu,
4708                         VMXERR_VMCLEAR_VMXON_POINTER);
4709
4710         /*
4711          * When Enlightened VMEntry is enabled on the calling CPU we treat
4712          * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
4713          * way to distinguish it from VMCS12) and we must not corrupt it by
4714          * writing to the non-existent 'launch_state' field. The area doesn't
4715          * have to be the currently active EVMCS on the calling CPU and there's
4716          * nothing KVM has to do to transition it from 'active' to 'non-active'
4717          * state. It is possible that the area will stay mapped as
4718          * vmx->nested.hv_evmcs but this shouldn't be a problem.
4719          */
4720         if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4721                    !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4722                 if (vmptr == vmx->nested.current_vmptr)
4723                         nested_release_vmcs12(vcpu);
4724
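                     /*
                      * Emulated VMCLEAR just marks the VMCS as not launched;
                      * if this was the current VMCS, its cached contents were
                      * already flushed back to guest memory by
                      * nested_release_vmcs12().
                      */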
4725                 kvm_vcpu_write_guest(vcpu,
4726                                      vmptr + offsetof(struct vmcs12,
4727                                                       launch_state),
4728                                      &zero, sizeof(zero));
4729         }
4730
4731         return nested_vmx_succeed(vcpu);
4732 }
4733
4734 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4735
4736 /* Emulate the VMLAUNCH instruction */
4737 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4738 {
4739         return nested_vmx_run(vcpu, true);
4740 }
4741
4742 /* Emulate the VMRESUME instruction */
4743 static int handle_vmresume(struct kvm_vcpu *vcpu)
4744 {
4746         return nested_vmx_run(vcpu, false);
4747 }
4748
4749 static int handle_vmread(struct kvm_vcpu *vcpu)
4750 {
4751         unsigned long field;
4752         u64 field_value;
4753         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4754         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4755         int len;
4756         gva_t gva = 0;
4757         struct vmcs12 *vmcs12;
4758         struct x86_exception e;
4759         short offset;
4760
4761         if (!nested_vmx_check_permission(vcpu))
4762                 return 1;
4763
4764         if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4765                 return nested_vmx_failInvalid(vcpu);
4766
4767         if (!is_guest_mode(vcpu))
4768                 vmcs12 = get_vmcs12(vcpu);
4769         else {
4770                 /*
4771                  * When vmcs12->vmcs_link_pointer is -1ull, any VMREAD
4772                  * to a shadowed field sets the ALU flags for VMfailInvalid.
4773                  */
4774                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4775                         return nested_vmx_failInvalid(vcpu);
4776                 vmcs12 = get_shadow_vmcs12(vcpu);
4777         }
4778
4779         /* Decode instruction info and find the field to read */
4780         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4781
4782         offset = vmcs_field_to_offset(field);
4783         if (offset < 0)
4784                 return nested_vmx_failValid(vcpu,
4785                         VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4786
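             /*
              * Rarely-used guest fields are synced from vmcs02 lazily; refresh
              * them now if L1 is reading one of those fields.
              */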
4787         if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4788                 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4789
4790         /* Read the field, zero-extended to a u64 field_value */
4791         field_value = vmcs12_read_any(vmcs12, field, offset);
4792
4793         /*
4794          * Now copy part of this value to register or memory, as requested.
4795          * Note that the number of bits actually copied is 32 or 64 depending
4796          * on the guest's mode (32 or 64 bit), not on the given field's length.
4797          */
4798         if (vmx_instruction_info & (1u << 10)) {
4799                 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4800                         field_value);
4801         } else {
4802                 len = is_64_bit_mode(vcpu) ? 8 : 4;
4803                 if (get_vmx_mem_address(vcpu, exit_qualification,
4804                                 vmx_instruction_info, true, len, &gva))
4805                         return 1;
4806                 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4807                 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
4808                         kvm_inject_page_fault(vcpu, &e);
4809         }
4810
4811         return nested_vmx_succeed(vcpu);
4812 }
4813
4814 static bool is_shadow_field_rw(unsigned long field)
4815 {
4816         switch (field) {
4817 #define SHADOW_FIELD_RW(x, y) case x:
4818 #include "vmcs_shadow_fields.h"
4819                 return true;
4820         default:
4821                 break;
4822         }
4823         return false;
4824 }
4825
4826 static bool is_shadow_field_ro(unsigned long field)
4827 {
4828         switch (field) {
4829 #define SHADOW_FIELD_RO(x, y) case x:
4830 #include "vmcs_shadow_fields.h"
4831                 return true;
4832         default:
4833                 break;
4834         }
4835         return false;
4836 }
4837
4838 static int handle_vmwrite(struct kvm_vcpu *vcpu)
4839 {
4840         unsigned long field;
4841         int len;
4842         gva_t gva;
4843         struct vcpu_vmx *vmx = to_vmx(vcpu);
4844         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4845         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4846
4847         /* The value to write might be 32 or 64 bits, depending on L1's long
4848          * mode, and eventually we need to write that into a field of several
4849          * possible lengths. The code below first zero-extends the value to 64
4850          * bit (field_value), and then copies only the appropriate number of
4851          * bits into the vmcs12 field.
4852          */
4853         u64 field_value = 0;
4854         struct x86_exception e;
4855         struct vmcs12 *vmcs12;
4856         short offset;
4857
4858         if (!nested_vmx_check_permission(vcpu))
4859                 return 1;
4860
4861         if (vmx->nested.current_vmptr == -1ull)
4862                 return nested_vmx_failInvalid(vcpu);
4863
4864         if (vmx_instruction_info & (1u << 10))
4865                 field_value = kvm_register_readl(vcpu,
4866                         (((vmx_instruction_info) >> 3) & 0xf));
4867         else {
4868                 len = is_64_bit_mode(vcpu) ? 8 : 4;
4869                 if (get_vmx_mem_address(vcpu, exit_qualification,
4870                                 vmx_instruction_info, false, len, &gva))
4871                         return 1;
4872                 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
4873                         kvm_inject_page_fault(vcpu, &e);
4874                         return 1;
4875                 }
4876         }
4877
4879         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4880         /*
4881          * If the vCPU supports "VMWRITE to any supported field in the
4882          * VMCS," then the "read-only" fields are actually read/write.
4883          */
4884         if (vmcs_field_readonly(field) &&
4885             !nested_cpu_has_vmwrite_any_field(vcpu))
4886                 return nested_vmx_failValid(vcpu,
4887                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4888
4889         if (!is_guest_mode(vcpu)) {
4890                 vmcs12 = get_vmcs12(vcpu);
4891
4892                 /*
4893                  * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4894                  * vmcs12, else we may clobber a field or consume a stale value.
4895                  */
4896                 if (!is_shadow_field_rw(field))
4897                         copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4898         } else {
4899                 /*
4900                  * When vmcs12->vmcs_link_pointer is -1ull, any VMWRITE
4901                  * to a shadowed field sets the ALU flags for VMfailInvalid.
4902                  */
4903                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4904                         return nested_vmx_failInvalid(vcpu);
4905                 vmcs12 = get_shadow_vmcs12(vcpu);
4906         }
4907
4908         offset = vmcs_field_to_offset(field);
4909         if (offset < 0)
4910                 return nested_vmx_failValid(vcpu,
4911                         VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4912
4913         /*
4914          * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4915          * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
4916          * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4917          * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4918          * from L1 will return a different value than VMREAD from L2 (L1 sees
4919          * the stripped down value, L2 sees the full value as stored by KVM).
4920          */
4921         if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4922                 field_value &= 0x1f0ff;
4923
4924         vmcs12_write_any(vmcs12, field, offset, field_value);
4925
4926         /*
4927          * Do not track vmcs12 dirty-state if in guest-mode as we actually
4928          * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
4929          * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4930          * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4931          */
4932         if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4933                 /*
4934                  * L1 can read these fields without exiting, ensure the
4935                  * shadow VMCS is up-to-date.
4936                  */
4937                 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4938                         preempt_disable();
4939                         vmcs_load(vmx->vmcs01.shadow_vmcs);
4940
4941                         __vmcs_writel(field, field_value);
4942
4943                         vmcs_clear(vmx->vmcs01.shadow_vmcs);
4944                         vmcs_load(vmx->loaded_vmcs->vmcs);
4945                         preempt_enable();
4946                 }
4947                 vmx->nested.dirty_vmcs12 = true;
4948         }
4949
4950         return nested_vmx_succeed(vcpu);
4951 }
4952
4953 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4954 {
4955         vmx->nested.current_vmptr = vmptr;
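             /*
              * Point vmcs01's VMCS link pointer at its shadow VMCS so that
              * L1's VMREAD/VMWRITE of shadowed fields can be satisfied by
              * hardware without a VM-exit.
              */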
4956         if (enable_shadow_vmcs) {
4957                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
4958                 vmcs_write64(VMCS_LINK_POINTER,
4959                              __pa(vmx->vmcs01.shadow_vmcs));
4960                 vmx->nested.need_vmcs12_to_shadow_sync = true;
4961         }
4962         vmx->nested.dirty_vmcs12 = true;
4963 }
4964
4965 /* Emulate the VMPTRLD instruction */
4966 static int handle_vmptrld(struct kvm_vcpu *vcpu)
4967 {
4968         struct vcpu_vmx *vmx = to_vmx(vcpu);
4969         gpa_t vmptr;
4970
4971         if (!nested_vmx_check_permission(vcpu))
4972                 return 1;
4973
4974         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4975                 return 1;
4976
4977         if (!page_address_valid(vcpu, vmptr))
4978                 return nested_vmx_failValid(vcpu,
4979                         VMXERR_VMPTRLD_INVALID_ADDRESS);
4980
4981         if (vmptr == vmx->nested.vmxon_ptr)
4982                 return nested_vmx_failValid(vcpu,
4983                         VMXERR_VMPTRLD_VMXON_POINTER);
4984
4985         /* Forbid normal VMPTRLD if Enlightened version was used */
4986         if (vmx->nested.hv_evmcs)
4987                 return 1;
4988
4989         if (vmx->nested.current_vmptr != vmptr) {
4990                 struct kvm_host_map map;
4991                 struct vmcs12 *new_vmcs12;
4992
4993                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4994                         /*
4995                          * Reads from an unbacked page return all 1s,
4996                          * which means that the 32 bits located at the
4997                          * given physical address won't match the required
4998                          * VMCS12_REVISION identifier.
4999                          */
5000                         return nested_vmx_failValid(vcpu,
5001                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5002                 }
5003
5004                 new_vmcs12 = map.hva;
5005
5006                 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5007                     (new_vmcs12->hdr.shadow_vmcs &&
5008                      !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5009                         kvm_vcpu_unmap(vcpu, &map, false);
5010                         return nested_vmx_failValid(vcpu,
5011                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5012                 }
5013
5014                 nested_release_vmcs12(vcpu);
5015
5016                 /*
5017                  * Load VMCS12 from guest memory since it is not already
5018                  * cached.
5019                  */
5020                 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
5021                 kvm_vcpu_unmap(vcpu, &map, false);
5022
5023                 set_current_vmptr(vmx, vmptr);
5024         }
5025
5026         return nested_vmx_succeed(vcpu);
5027 }
5028
5029 /* Emulate the VMPTRST instruction */
5030 static int handle_vmptrst(struct kvm_vcpu *vcpu)
5031 {
5032         unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
5033         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5034         gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5035         struct x86_exception e;
5036         gva_t gva;
5037
5038         if (!nested_vmx_check_permission(vcpu))
5039                 return 1;
5040
5041         if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
5042                 return 1;
5043
5044         if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5045                                 true, sizeof(gpa_t), &gva))
5046                 return 1;
5047         /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5048         if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5049                                         sizeof(gpa_t), &e)) {
5050                 kvm_inject_page_fault(vcpu, &e);
5051                 return 1;
5052         }
5053         return nested_vmx_succeed(vcpu);
5054 }
5055
5056 /* Emulate the INVEPT instruction */
5057 static int handle_invept(struct kvm_vcpu *vcpu)
5058 {
5059         struct vcpu_vmx *vmx = to_vmx(vcpu);
5060         u32 vmx_instruction_info, types;
5061         unsigned long type;
5062         gva_t gva;
5063         struct x86_exception e;
5064         struct {
5065                 u64 eptp, gpa;
5066         } operand;
5067
5068         if (!(vmx->nested.msrs.secondary_ctls_high &
5069               SECONDARY_EXEC_ENABLE_EPT) ||
5070             !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5071                 kvm_queue_exception(vcpu, UD_VECTOR);
5072                 return 1;
5073         }
5074
5075         if (!nested_vmx_check_permission(vcpu))
5076                 return 1;
5077
5078         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5079         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5080
5081         types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5082
5083         if (type >= 32 || !(types & (1 << type)))
5084                 return nested_vmx_failValid(vcpu,
5085                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5086
5087         /* According to the Intel VMX instruction reference, the memory
5088          * operand is read even if it isn't needed (e.g., for type==global)
5089          */
5090         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5091                         vmx_instruction_info, false, sizeof(operand), &gva))
5092                 return 1;
5093         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5094                 kvm_inject_page_fault(vcpu, &e);
5095                 return 1;
5096         }
5097
5098         switch (type) {
5099         case VMX_EPT_EXTENT_GLOBAL:
5100         case VMX_EPT_EXTENT_CONTEXT:
5101         /*
5102          * TODO: Sync the necessary shadow EPT roots here, rather than
5103          * at the next emulated VM-entry.
5104          */
5105                 break;
5106         default:
5107                 BUG_ON(1);
5108                 break;
5109         }
5110
5111         return nested_vmx_succeed(vcpu);
5112 }
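
/*
 * Worked example (illustration only, added for clarity): the "& 6" above
 * keeps just bits 1 and 2 of the shifted EPT capability word, which line
 * up with the architectural INVEPT types single-context (1) and
 * all-context/global (2).  When both are advertised to L1, types == 6,
 * so type 0 and any type >= 3 fail the "types & (1 << type)" test and
 * the instruction completes with VMfailValid for L1.
 */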
5113
5114 static int handle_invvpid(struct kvm_vcpu *vcpu)
5115 {
5116         struct vcpu_vmx *vmx = to_vmx(vcpu);
5117         u32 vmx_instruction_info;
5118         unsigned long type, types;
5119         gva_t gva;
5120         struct x86_exception e;
5121         struct {
5122                 u64 vpid;
5123                 u64 gla;
5124         } operand;
5125         u16 vpid02;
5126
5127         if (!(vmx->nested.msrs.secondary_ctls_high &
5128               SECONDARY_EXEC_ENABLE_VPID) ||
5129                         !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5130                 kvm_queue_exception(vcpu, UD_VECTOR);
5131                 return 1;
5132         }
5133
5134         if (!nested_vmx_check_permission(vcpu))
5135                 return 1;
5136
5137         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5138         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5139
5140         types = (vmx->nested.msrs.vpid_caps &
5141                         VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5142
5143         if (type >= 32 || !(types & (1 << type)))
5144                 return nested_vmx_failValid(vcpu,
5145                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5146
5147         /* According to the Intel VMX instruction reference, the memory
5148          * operand is read even if it isn't needed (e.g., for type==global)
5149          */
5150         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5151                         vmx_instruction_info, false, sizeof(operand), &gva))
5152                 return 1;
5153         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5154                 kvm_inject_page_fault(vcpu, &e);
5155                 return 1;
5156         }
5157         if (operand.vpid >> 16)
5158                 return nested_vmx_failValid(vcpu,
5159                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5160
5161         vpid02 = nested_get_vpid02(vcpu);
5162         switch (type) {
5163         case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5164                 if (!operand.vpid ||
5165                     is_noncanonical_address(operand.gla, vcpu))
5166                         return nested_vmx_failValid(vcpu,
5167                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5168                 if (cpu_has_vmx_invvpid_individual_addr()) {
5169                         __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
5170                                 vpid02, operand.gla);
5171                 } else
5172                         __vmx_flush_tlb(vcpu, vpid02, false);
5173                 break;
5174         case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5175         case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5176                 if (!operand.vpid)
5177                         return nested_vmx_failValid(vcpu,
5178                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5179                 __vmx_flush_tlb(vcpu, vpid02, false);
5180                 break;
5181         case VMX_VPID_EXTENT_ALL_CONTEXT:
5182                 __vmx_flush_tlb(vcpu, vpid02, false);
5183                 break;
5184         default:
5185                 WARN_ON_ONCE(1);
5186                 return kvm_skip_emulated_instruction(vcpu);
5187         }
5188
5189         return nested_vmx_succeed(vcpu);
5190 }
5191
5192 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5193                                      struct vmcs12 *vmcs12)
5194 {
5195         u32 index = kvm_rcx_read(vcpu);
5196         u64 address;
5197         bool accessed_dirty;
5198         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5199
5200         if (!nested_cpu_has_eptp_switching(vmcs12) ||
5201             !nested_cpu_has_ept(vmcs12))
5202                 return 1;
5203
5204         if (index >= VMFUNC_EPTP_ENTRIES)
5205                 return 1;
5206
5207
5208         if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5209                                      &address, index * 8, 8))
5210                 return 1;
5211
5212         accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
5213
5214         /*
5215          * If the (L2) guest does a vmfunc to the currently
5216          * active ept pointer, we don't have to do anything else
5217          */
5218         if (vmcs12->ept_pointer != address) {
5219                 if (!valid_ept_address(vcpu, address))
5220                         return 1;
5221
5222                 kvm_mmu_unload(vcpu);
5223                 mmu->ept_ad = accessed_dirty;
5224                 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5225                 vmcs12->ept_pointer = address;
5226                 /*
5227                  * TODO: Check what's the correct approach in case
5228                  * mmu reload fails. Currently, we just let the next
5229                  * reload potentially fail
5230                  */
5231                 kvm_mmu_reload(vcpu);
5232         }
5233
5234         return 0;
5235 }
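
/*
 * Worked example (illustration only): the EPTP list referenced by
 * vmcs12->eptp_list_address is one guest page of 8-byte entries, so the
 * read above fetches the quadword at page offset index * 8 (e.g. ECX = 3
 * selects offset 24).  An out-of-range index, a malformed EPTP, or a
 * failed guest read all return 1, which makes handle_vmfunc() reflect
 * the VMFUNC exit to L1 instead of emulating the switch.
 */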
5236
5237 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5238 {
5239         struct vcpu_vmx *vmx = to_vmx(vcpu);
5240         struct vmcs12 *vmcs12;
5241         u32 function = kvm_rax_read(vcpu);
5242
5243         /*
5244          * VMFUNC is only supported for nested guests, but we always enable the
5245          * secondary control for simplicity; for non-nested mode, fake that we
5246          * didn't by injecting #UD.
5247          */
5248         if (!is_guest_mode(vcpu)) {
5249                 kvm_queue_exception(vcpu, UD_VECTOR);
5250                 return 1;
5251         }
5252
5253         vmcs12 = get_vmcs12(vcpu);
5254         if ((vmcs12->vm_function_control & (1 << function)) == 0)
5255                 goto fail;
5256
5257         switch (function) {
5258         case 0:
5259                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5260                         goto fail;
5261                 break;
5262         default:
5263                 goto fail;
5264         }
5265         return kvm_skip_emulated_instruction(vcpu);
5266
5267 fail:
5268         nested_vmx_vmexit(vcpu, vmx->exit_reason,
5269                           vmcs_read32(VM_EXIT_INTR_INFO),
5270                           vmcs_readl(EXIT_QUALIFICATION));
5271         return 1;
5272 }
5273
5274
5275 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5276                                        struct vmcs12 *vmcs12)
5277 {
5278         unsigned long exit_qualification;
5279         gpa_t bitmap, last_bitmap;
5280         unsigned int port;
5281         int size;
5282         u8 b;
5283
5284         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5285                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5286
5287         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5288
5289         port = exit_qualification >> 16;
5290         size = (exit_qualification & 7) + 1;
5291
5292         last_bitmap = (gpa_t)-1;
5293         b = -1;
5294
5295         while (size > 0) {
5296                 if (port < 0x8000)
5297                         bitmap = vmcs12->io_bitmap_a;
5298                 else if (port < 0x10000)
5299                         bitmap = vmcs12->io_bitmap_b;
5300                 else
5301                         return true;
5302                 bitmap += (port & 0x7fff) / 8;
5303
5304                 if (last_bitmap != bitmap)
5305                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5306                                 return true;
5307                 if (b & (1 << (port & 7)))
5308                         return true;
5309
5310                 port++;
5311                 size--;
5312                 last_bitmap = bitmap;
5313         }
5314
5315         return false;
5316 }
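
/*
 * Worked example (illustration only): a one-byte OUT to port 0x3f8 lands
 * in io_bitmap_a (port < 0x8000), byte (0x3f8 & 0x7fff) / 8 = 127, bit
 * 0x3f8 & 7 = 0.  A multi-byte access that crosses the 0x7fff/0x8000
 * boundary switches from io_bitmap_a to io_bitmap_b part-way through,
 * which is why the bitmap is re-selected on every loop iteration above.
 */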
5317
5318 /*
5319  * Return true if we should exit from L2 to L1 to handle an MSR access,
5320  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5321  * disinterest in the current event (read or write a specific MSR) by using an
5322  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5323  */
5324 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5325         struct vmcs12 *vmcs12, u32 exit_reason)
5326 {
5327         u32 msr_index = kvm_rcx_read(vcpu);
5328         gpa_t bitmap;
5329
5330         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5331                 return true;
5332
5333         /*
5334          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5335          * for the four combinations of read/write and low/high MSR numbers.
5336          * First we need to figure out which of the four to use:
5337          */
5338         bitmap = vmcs12->msr_bitmap;
5339         if (exit_reason == EXIT_REASON_MSR_WRITE)
5340                 bitmap += 2048;
5341         if (msr_index >= 0xc0000000) {
5342                 msr_index -= 0xc0000000;
5343                 bitmap += 1024;
5344         }
5345
5346         /* Then read the msr_index'th bit from this bitmap: */
5347         if (msr_index < 1024*8) {
5348                 unsigned char b;
5349                 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5350                         return true;
5351                 return 1 & (b >> (msr_index & 7));
5352         } else
5353                 return true; /* let L1 handle the wrong parameter */
5354 }
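
/*
 * Illustration only, not used by KVM: a minimal sketch of the quadrant
 * selection above, assuming the 4 KiB vmcs12 MSR bitmap has already been
 * copied into a local buffer.  For example, a WRMSR to MSR_EFER
 * (0xc0000080) tests bit 0 of byte 2048 + 1024 + 0x80 / 8 = 3088.
 */
static bool __maybe_unused msr_bitmap_test_sketch(const u8 *bitmap,
                                                  u32 msr, bool write)
{
        u32 offset = write ? 2048 : 0;          /* read vs. write bitmaps */

        if (msr >= 0xc0000000) {                /* high MSR range */
                msr -= 0xc0000000;
                offset += 1024;                 /* low vs. high MSR bitmaps */
        }
        if (msr >= 1024 * 8)                    /* out of range: let L1 handle it */
                return true;

        return bitmap[offset + msr / 8] & (1 << (msr & 7));
}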
5355
5356 /*
5357  * Return true if we should exit from L2 to L1 to handle a CR access exit,
5358  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5359  * intercept (via guest_host_mask etc.) the current event.
5360  */
5361 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5362         struct vmcs12 *vmcs12)
5363 {
5364         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5365         int cr = exit_qualification & 15;
5366         int reg;
5367         unsigned long val;
5368
5369         switch ((exit_qualification >> 4) & 3) {
5370         case 0: /* mov to cr */
5371                 reg = (exit_qualification >> 8) & 15;
5372                 val = kvm_register_readl(vcpu, reg);
5373                 switch (cr) {
5374                 case 0:
5375                         if (vmcs12->cr0_guest_host_mask &
5376                             (val ^ vmcs12->cr0_read_shadow))
5377                                 return true;
5378                         break;
5379                 case 3:
5380                         if ((vmcs12->cr3_target_count >= 1 &&
5381                                         vmcs12->cr3_target_value0 == val) ||
5382                                 (vmcs12->cr3_target_count >= 2 &&
5383                                         vmcs12->cr3_target_value1 == val) ||
5384                                 (vmcs12->cr3_target_count >= 3 &&
5385                                         vmcs12->cr3_target_value2 == val) ||
5386                                 (vmcs12->cr3_target_count >= 4 &&
5387                                         vmcs12->cr3_target_value3 == val))
5388                                 return false;
5389                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5390                                 return true;
5391                         break;
5392                 case 4:
5393                         if (vmcs12->cr4_guest_host_mask &
5394                             (vmcs12->cr4_read_shadow ^ val))
5395                                 return true;
5396                         break;
5397                 case 8:
5398                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5399                                 return true;
5400                         break;
5401                 }
5402                 break;
5403         case 2: /* clts */
5404                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5405                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
5406                         return true;
5407                 break;
5408         case 1: /* mov from cr */
5409                 switch (cr) {
5410                 case 3:
5411                         if (vmcs12->cpu_based_vm_exec_control &
5412                             CPU_BASED_CR3_STORE_EXITING)
5413                                 return true;
5414                         break;
5415                 case 8:
5416                         if (vmcs12->cpu_based_vm_exec_control &
5417                             CPU_BASED_CR8_STORE_EXITING)
5418                                 return true;
5419                         break;
5420                 }
5421                 break;
5422         case 3: /* lmsw */
5423                 /*
5424                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5425                  * cr0. Other attempted changes are ignored, with no exit.
5426                  */
5427                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5428                 if (vmcs12->cr0_guest_host_mask & 0xe &
5429                     (val ^ vmcs12->cr0_read_shadow))
5430                         return true;
5431                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5432                     !(vmcs12->cr0_read_shadow & 0x1) &&
5433                     (val & 0x1))
5434                         return true;
5435                 break;
5436         }
5437         return false;
5438 }
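
/*
 * Worked example (illustration only): for a "mov %rax, %cr0" in L2, the
 * exit qualification encodes cr = 0 in bits 3:0, access type 0 ("mov to
 * cr") in bits 5:4 and reg = RAX in bits 11:8.  The exit is reflected to
 * L1 only if the new value differs from cr0_read_shadow in a bit that L1
 * owns via cr0_guest_host_mask; everything else is handled in L0.
 */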
5439
5440 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5441         struct vmcs12 *vmcs12, gpa_t bitmap)
5442 {
5443         u32 vmx_instruction_info;
5444         unsigned long field;
5445         u8 b;
5446
5447         if (!nested_cpu_has_shadow_vmcs(vmcs12))
5448                 return true;
5449
5450         /* Decode instruction info and find the field to access */
5451         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5452         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5453
5454         /* Out-of-range fields always cause a VM exit from L2 to L1 */
5455         if (field >> 15)
5456                 return true;
5457
5458         if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5459                 return true;
5460
5461         return 1 & (b >> (field & 7));
5462 }
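
/*
 * Worked example (illustration only): each shadow-VMCS bitmap covers one
 * guest page, i.e. 4096 * 8 = 32768 field encodings, hence the
 * "field >> 15" range check.  A VMREAD of encoding 0x4402 (exit reason)
 * tests bit 0x4402 & 7 = 2 of byte 0x4402 / 8 = 0x880 in
 * vmcs12->vmread_bitmap.
 */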
5463
5464 /*
5465  * Return true if we should exit from L2 to L1 to handle an exit, or false if
5466  * we should handle it ourselves in L0 (and then continue L2). Only call this
5467  * when in is_guest_mode (L2).
5468  */
5469 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5470 {
5471         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5472         struct vcpu_vmx *vmx = to_vmx(vcpu);
5473         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5474
5475         if (vmx->nested.nested_run_pending)
5476                 return false;
5477
5478         if (unlikely(vmx->fail)) {
5479                 trace_kvm_nested_vmenter_failed(
5480                         "hardware VM-instruction error: ",
5481                         vmcs_read32(VM_INSTRUCTION_ERROR));
5482                 return true;
5483         }
5484
5485         /*
5486          * The host physical addresses of some pages of guest memory
5487          * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5488          * Page). The CPU may write to these pages via their host
5489          * physical address while L2 is running, bypassing any
5490          * address-translation-based dirty tracking (e.g. EPT write
5491          * protection).
5492          *
5493          * Mark them dirty on every exit from L2 to prevent them from
5494          * getting out of sync with dirty tracking.
5495          */
5496         nested_mark_vmcs12_pages_dirty(vcpu);
5497
5498         trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5499                                 vmcs_readl(EXIT_QUALIFICATION),
5500                                 vmx->idt_vectoring_info,
5501                                 intr_info,
5502                                 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5503                                 KVM_ISA_VMX);
5504
5505         switch (exit_reason) {
5506         case EXIT_REASON_EXCEPTION_NMI:
5507                 if (is_nmi(intr_info))
5508                         return false;
5509                 else if (is_page_fault(intr_info))
5510                         return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5511                 else if (is_debug(intr_info) &&
5512                          vcpu->guest_debug &
5513                          (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5514                         return false;
5515                 else if (is_breakpoint(intr_info) &&
5516                          vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5517                         return false;
5518                 return vmcs12->exception_bitmap &
5519                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5520         case EXIT_REASON_EXTERNAL_INTERRUPT:
5521                 return false;
5522         case EXIT_REASON_TRIPLE_FAULT:
5523                 return true;
5524         case EXIT_REASON_PENDING_INTERRUPT:
5525                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5526         case EXIT_REASON_NMI_WINDOW:
5527                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5528         case EXIT_REASON_TASK_SWITCH:
5529                 return true;
5530         case EXIT_REASON_CPUID:
5531                 return true;
5532         case EXIT_REASON_HLT:
5533                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5534         case EXIT_REASON_INVD:
5535                 return true;
5536         case EXIT_REASON_INVLPG:
5537                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5538         case EXIT_REASON_RDPMC:
5539                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5540         case EXIT_REASON_RDRAND:
5541                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5542         case EXIT_REASON_RDSEED:
5543                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5544         case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5545                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5546         case EXIT_REASON_VMREAD:
5547                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5548                         vmcs12->vmread_bitmap);
5549         case EXIT_REASON_VMWRITE:
5550                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5551                         vmcs12->vmwrite_bitmap);
5552         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5553         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5554         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5555         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5556         case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5557                 /*
5558                  * VMX instructions trap unconditionally. This allows L1 to
5559                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
5560                  */
5561                 return true;
5562         case EXIT_REASON_CR_ACCESS:
5563                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5564         case EXIT_REASON_DR_ACCESS:
5565                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5566         case EXIT_REASON_IO_INSTRUCTION:
5567                 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5568         case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5569                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5570         case EXIT_REASON_MSR_READ:
5571         case EXIT_REASON_MSR_WRITE:
5572                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5573         case EXIT_REASON_INVALID_STATE:
5574                 return true;
5575         case EXIT_REASON_MWAIT_INSTRUCTION:
5576                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5577         case EXIT_REASON_MONITOR_TRAP_FLAG:
5578                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5579         case EXIT_REASON_MONITOR_INSTRUCTION:
5580                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5581         case EXIT_REASON_PAUSE_INSTRUCTION:
5582                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5583                         nested_cpu_has2(vmcs12,
5584                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5585         case EXIT_REASON_MCE_DURING_VMENTRY:
5586                 return false;
5587         case EXIT_REASON_TPR_BELOW_THRESHOLD:
5588                 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5589         case EXIT_REASON_APIC_ACCESS:
5590         case EXIT_REASON_APIC_WRITE:
5591         case EXIT_REASON_EOI_INDUCED:
5592                 /*
5593                  * The controls for "virtualize APIC accesses," "APIC-
5594                  * register virtualization," and "virtual-interrupt
5595                  * delivery" only come from vmcs12.
5596                  */
5597                 return true;
5598         case EXIT_REASON_EPT_VIOLATION:
5599                 /*
5600                  * L0 always deals with the EPT violation. If nested EPT is
5601                  * used, and the nested mmu code discovers that the address is
5602                  * missing in the guest EPT table (EPT12), the EPT violation
5603                  * will be injected with nested_ept_inject_page_fault()
5604                  */
5605                 return false;
5606         case EXIT_REASON_EPT_MISCONFIG:
5607                 /*
5608                  * L2 never uses L1's EPT directly, but rather L0's own EPT
5609                  * table (shadow on EPT) or a merged EPT table that L0 built
5610                  * (EPT on EPT). So any problem with the structure of the
5611                  * table is L0's fault.
5612                  */
5613                 return false;
5614         case EXIT_REASON_INVPCID:
5615                 return
5616                         nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5617                         nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5618         case EXIT_REASON_WBINVD:
5619                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5620         case EXIT_REASON_XSETBV:
5621                 return true;
5622         case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5623                 /*
5624                  * This should never happen, since it is not possible to
5625                  * set XSS to a non-zero value---neither in L1 nor in L2.
5626          * If it were, XSS would have to be checked against
5627                  * the XSS exit bitmap in vmcs12.
5628                  */
5629                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5630         case EXIT_REASON_PREEMPTION_TIMER:
5631                 return false;
5632         case EXIT_REASON_PML_FULL:
5633                 /* We emulate PML support to L1. */
5634                 return false;
5635         case EXIT_REASON_VMFUNC:
5636                 /* VM functions are emulated through L2->L0 vmexits. */
5637                 return false;
5638         case EXIT_REASON_ENCLS:
5639                 /* SGX is never exposed to L1 */
5640                 return false;
5641         case EXIT_REASON_UMWAIT:
5642         case EXIT_REASON_TPAUSE:
5643                 return nested_cpu_has2(vmcs12,
5644                         SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
5645         default:
5646                 return true;
5647         }
5648 }
5649
5650
5651 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5652                                 struct kvm_nested_state __user *user_kvm_nested_state,
5653                                 u32 user_data_size)
5654 {
5655         struct vcpu_vmx *vmx;
5656         struct vmcs12 *vmcs12;
5657         struct kvm_nested_state kvm_state = {
5658                 .flags = 0,
5659                 .format = KVM_STATE_NESTED_FORMAT_VMX,
5660                 .size = sizeof(kvm_state),
5661                 .hdr.vmx.vmxon_pa = -1ull,
5662                 .hdr.vmx.vmcs12_pa = -1ull,
5663         };
5664         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5665                 &user_kvm_nested_state->data.vmx[0];
5666
5667         if (!vcpu)
5668                 return kvm_state.size + sizeof(*user_vmx_nested_state);
5669
5670         vmx = to_vmx(vcpu);
5671         vmcs12 = get_vmcs12(vcpu);
5672
5673         if (nested_vmx_allowed(vcpu) &&
5674             (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5675                 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5676                 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
5677
5678                 if (vmx_has_valid_vmcs12(vcpu)) {
5679                         kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
5680
5681                         if (vmx->nested.hv_evmcs)
5682                                 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5683
5684                         if (is_guest_mode(vcpu) &&
5685                             nested_cpu_has_shadow_vmcs(vmcs12) &&
5686                             vmcs12->vmcs_link_pointer != -1ull)
5687                                 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
5688                 }
5689
5690                 if (vmx->nested.smm.vmxon)
5691                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5692
5693                 if (vmx->nested.smm.guest_mode)
5694                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5695
5696                 if (is_guest_mode(vcpu)) {
5697                         kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5698
5699                         if (vmx->nested.nested_run_pending)
5700                                 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5701                 }
5702         }
5703
5704         if (user_data_size < kvm_state.size)
5705                 goto out;
5706
5707         if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5708                 return -EFAULT;
5709
5710         if (!vmx_has_valid_vmcs12(vcpu))
5711                 goto out;
5712
5713         /*
5714          * When running L2, the authoritative vmcs12 state is in the
5715          * vmcs02. When running L1, the authoritative vmcs12 state is
5716          * in the shadow or enlightened vmcs linked to vmcs01, unless
5717          * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5718          * vmcs12 state is in the vmcs12 already.
5719          */
5720         if (is_guest_mode(vcpu)) {
5721                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5722                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5723         } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5724                 if (vmx->nested.hv_evmcs)
5725                         copy_enlightened_to_vmcs12(vmx);
5726                 else if (enable_shadow_vmcs)
5727                         copy_shadow_to_vmcs12(vmx);
5728         }
5729
5730         BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5731         BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5732
5733         /*
5734          * Copy over the full allocated size of vmcs12 rather than just the size
5735          * of the struct.
5736          */
5737         if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
5738                 return -EFAULT;
5739
5740         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5741             vmcs12->vmcs_link_pointer != -1ull) {
5742                 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
5743                                  get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5744                         return -EFAULT;
5745         }
5746
5747 out:
5748         return kvm_state.size;
5749 }
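
/*
 * Usage sketch (illustration only, userspace side, not part of this
 * file, assuming an already-created vCPU fd in vcpu_fd): the size
 * negotiation above is typically consumed with two KVM_GET_NESTED_STATE
 * calls; the first, with only the fixed-size header, is expected to fail
 * with E2BIG and report the required size back in kvm_nested_state.size:
 *
 *        struct kvm_nested_state hdr = { .size = sizeof(hdr) };
 *        struct kvm_nested_state *state;
 *
 *        if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, &hdr) && errno == E2BIG) {
 *                state = calloc(1, hdr.size);
 *                state->size = hdr.size;
 *                ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state);
 *        }
 */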
5750
5751 /*
5752  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5753  */
5754 void vmx_leave_nested(struct kvm_vcpu *vcpu)
5755 {
5756         if (is_guest_mode(vcpu)) {
5757                 to_vmx(vcpu)->nested.nested_run_pending = 0;
5758                 nested_vmx_vmexit(vcpu, -1, 0, 0);
5759         }
5760         free_nested(vcpu);
5761 }
5762
5763 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5764                                 struct kvm_nested_state __user *user_kvm_nested_state,
5765                                 struct kvm_nested_state *kvm_state)
5766 {
5767         struct vcpu_vmx *vmx = to_vmx(vcpu);
5768         struct vmcs12 *vmcs12;
5769         u32 exit_qual;
5770         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5771                 &user_kvm_nested_state->data.vmx[0];
5772         int ret;
5773
5774         if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
5775                 return -EINVAL;
5776
5777         if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5778                 if (kvm_state->hdr.vmx.smm.flags)
5779                         return -EINVAL;
5780
5781                 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
5782                         return -EINVAL;
5783
5784                 /*
5785                  * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5786                  * enable the eVMCS capability on the vCPU. However, the code
5787                  * has since been changed so that the flag signals that vmcs12
5788                  * should be copied into the eVMCS in guest memory.
5789                  *
5790                  * To preserve backwards compatibility, allow userspace
5791                  * to set this flag even when there is no VMXON region.
5792                  */
5793                 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5794                         return -EINVAL;
5795         } else {
5796                 if (!nested_vmx_allowed(vcpu))
5797                         return -EINVAL;
5798
5799                 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5800                         return -EINVAL;
5801         }
5802
5803         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5804             (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5805                 return -EINVAL;
5806
5807         if (kvm_state->hdr.vmx.smm.flags &
5808             ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5809                 return -EINVAL;
5810
5811         /*
5812          * SMM temporarily disables VMX, so we cannot be in guest mode,
5813          * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5814          * must be zero.
5815          */
5816         if (is_smm(vcpu) ?
5817                 (kvm_state->flags &
5818                  (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5819                 : kvm_state->hdr.vmx.smm.flags)
5820                 return -EINVAL;
5821
5822         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5823             !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5824                 return -EINVAL;
5825
5826         if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5827             (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
5828                 return -EINVAL;
5829
5830         vmx_leave_nested(vcpu);
5831
5832         if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
5833                 return 0;
5834
5835         vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
5836         ret = enter_vmx_operation(vcpu);
5837         if (ret)
5838                 return ret;
5839
5840         /* Empty 'VMXON' state is permitted */
5841         if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5842                 return 0;
5843
5844         if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5845                 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5846                     !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
5847                         return -EINVAL;
5848
5849                 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
5850         } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5851                 /*
5852                  * Sync eVMCS upon entry as we may not have
5853                  * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5854                  */
5855                 vmx->nested.need_vmcs12_to_shadow_sync = true;
5856         } else {
5857                 return -EINVAL;
5858         }
5859
5860         if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5861                 vmx->nested.smm.vmxon = true;
5862                 vmx->nested.vmxon = false;
5863
5864                 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5865                         vmx->nested.smm.guest_mode = true;
5866         }
5867
5868         vmcs12 = get_vmcs12(vcpu);
5869         if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
5870                 return -EFAULT;
5871
5872         if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5873                 return -EINVAL;
5874
5875         if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5876                 return 0;
5877
5878         vmx->nested.nested_run_pending =
5879                 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5880
5881         ret = -EINVAL;
5882         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5883             vmcs12->vmcs_link_pointer != -1ull) {
5884                 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5885
5886                 if (kvm_state->size <
5887                     sizeof(*kvm_state) +
5888                     sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
5889                         goto error_guest_mode;
5890
5891                 if (copy_from_user(shadow_vmcs12,
5892                                    user_vmx_nested_state->shadow_vmcs12,
5893                                    sizeof(*shadow_vmcs12))) {
5894                         ret = -EFAULT;
5895                         goto error_guest_mode;
5896                 }
5897
5898                 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5899                     !shadow_vmcs12->hdr.shadow_vmcs)
5900                         goto error_guest_mode;
5901         }
5902
5903         if (nested_vmx_check_controls(vcpu, vmcs12) ||
5904             nested_vmx_check_host_state(vcpu, vmcs12) ||
5905             nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5906                 goto error_guest_mode;
5907
5908         vmx->nested.dirty_vmcs12 = true;
5909         ret = nested_vmx_enter_non_root_mode(vcpu, false);
5910         if (ret)
5911                 goto error_guest_mode;
5912
5913         return 0;
5914
5915 error_guest_mode:
5916         vmx->nested.nested_run_pending = 0;
5917         return ret;
5918 }
5919
5920 void nested_vmx_set_vmcs_shadowing_bitmap(void)
5921 {
5922         if (enable_shadow_vmcs) {
5923                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5924                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5925         }
5926 }
5927
5928 /*
5929  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5930  * returned for the various VMX controls MSRs when nested VMX is enabled.
5931  * The same values should also be used to verify that vmcs12 control fields are
5932  * valid during nested entry from L1 to L2.
5933  * Each of these control MSRs has a low and high 32-bit half: a low bit is on
5934  * if the corresponding bit in the (32-bit) control field *must* be on, and a
5935  * bit in the high half is on if the corresponding bit in the control field
5936  * may be on. See also vmx_control_verify().
5937  */
5938 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5939                                 bool apicv)
5940 {
5941         /*
5942          * Note that as a general rule, the high half of the MSRs (bits in
5943          * the control fields which may be 1) should be initialized by the
5944          * intersection of the underlying hardware's MSR (i.e., features which
5945          * can be supported) and the list of features we want to expose -
5946          * because they are known to be properly supported in our code.
5947          * Also, usually, the low half of the MSRs (bits which must be 1) can
5948          * be set to 0, meaning that L1 may turn off any of these bits. The
5949          * reason is that if one of these bits is necessary, it will already
5950          * be set in vmcs01, and prepare_vmcs02(), which bitwise-ORs the
5951          * control fields of vmcs01 and vmcs12, will keep it set in vmcs02 -
5952          * and nested_vmx_exit_reflected() will not pass related exits to L1.
5953          * These rules have exceptions below.
5954          */
5955
5956         /* pin-based controls */
5957         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5958                 msrs->pinbased_ctls_low,
5959                 msrs->pinbased_ctls_high);
5960         msrs->pinbased_ctls_low |=
5961                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5962         msrs->pinbased_ctls_high &=
5963                 PIN_BASED_EXT_INTR_MASK |
5964                 PIN_BASED_NMI_EXITING |
5965                 PIN_BASED_VIRTUAL_NMIS |
5966                 (apicv ? PIN_BASED_POSTED_INTR : 0);
5967         msrs->pinbased_ctls_high |=
5968                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5969                 PIN_BASED_VMX_PREEMPTION_TIMER;
5970
5971         /* exit controls */
5972         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5973                 msrs->exit_ctls_low,
5974                 msrs->exit_ctls_high);
5975         msrs->exit_ctls_low =
5976                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5977
5978         msrs->exit_ctls_high &=
5979 #ifdef CONFIG_X86_64
5980                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
5981 #endif
5982                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5983         msrs->exit_ctls_high |=
5984                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5985                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5986                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5987
5988         /* We support free control of debug control saving. */
5989         msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5990
5991         /* entry controls */
5992         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5993                 msrs->entry_ctls_low,
5994                 msrs->entry_ctls_high);
5995         msrs->entry_ctls_low =
5996                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
5997         msrs->entry_ctls_high &=
5998 #ifdef CONFIG_X86_64
5999                 VM_ENTRY_IA32E_MODE |
6000 #endif
6001                 VM_ENTRY_LOAD_IA32_PAT;
6002         msrs->entry_ctls_high |=
6003                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6004
6005         /* We support free control of debug control loading. */
6006         msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6007
6008         /* cpu-based controls */
6009         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6010                 msrs->procbased_ctls_low,
6011                 msrs->procbased_ctls_high);
6012         msrs->procbased_ctls_low =
6013                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6014         msrs->procbased_ctls_high &=
6015                 CPU_BASED_VIRTUAL_INTR_PENDING |
6016                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
6017                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6018                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6019                 CPU_BASED_CR3_STORE_EXITING |
6020 #ifdef CONFIG_X86_64
6021                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6022 #endif
6023                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6024                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6025                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6026                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6027                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6028         /*
6029          * We can allow some features even when not supported by the
6030          * hardware. For example, L1 can specify an MSR bitmap - and we
6031          * can use it to avoid exits to L1 - even when L0 runs L2
6032          * without MSR bitmaps.
6033          */
6034         msrs->procbased_ctls_high |=
6035                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6036                 CPU_BASED_USE_MSR_BITMAPS;
6037
6038         /* We support free control of CR3 access interception. */
6039         msrs->procbased_ctls_low &=
6040                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6041
6042         /*
6043          * secondary cpu-based controls.  Do not include those that
6044          * depend on CPUID bits; they are added later by vmx_cpuid_update.
6045          */
6046         if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6047                 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6048                       msrs->secondary_ctls_low,
6049                       msrs->secondary_ctls_high);
6050
6051         msrs->secondary_ctls_low = 0;
6052         msrs->secondary_ctls_high &=
6053                 SECONDARY_EXEC_DESC |
6054                 SECONDARY_EXEC_RDTSCP |
6055                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6056                 SECONDARY_EXEC_WBINVD_EXITING |
6057                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6058                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6059                 SECONDARY_EXEC_RDRAND_EXITING |
6060                 SECONDARY_EXEC_ENABLE_INVPCID |
6061                 SECONDARY_EXEC_RDSEED_EXITING |
6062                 SECONDARY_EXEC_XSAVES;
6063
6064         /*
6065          * We can emulate "VMCS shadowing," even if the hardware
6066          * doesn't support it.
6067          */
6068         msrs->secondary_ctls_high |=
6069                 SECONDARY_EXEC_SHADOW_VMCS;
6070
6071         if (enable_ept) {
6072                 /* nested EPT: emulate EPT also to L1 */
6073                 msrs->secondary_ctls_high |=
6074                         SECONDARY_EXEC_ENABLE_EPT;
6075                 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
6076                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
6077                 if (cpu_has_vmx_ept_execute_only())
6078                         msrs->ept_caps |=
6079                                 VMX_EPT_EXECUTE_ONLY_BIT;
6080                 msrs->ept_caps &= ept_caps;
6081                 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6082                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6083                         VMX_EPT_1GB_PAGE_BIT;
6084                 if (enable_ept_ad_bits) {
6085                         msrs->secondary_ctls_high |=
6086                                 SECONDARY_EXEC_ENABLE_PML;
6087                         msrs->ept_caps |= VMX_EPT_AD_BIT;
6088                 }
6089         }
6090
6091         if (cpu_has_vmx_vmfunc()) {
6092                 msrs->secondary_ctls_high |=
6093                         SECONDARY_EXEC_ENABLE_VMFUNC;
6094                 /*
6095                  * Advertise EPTP switching unconditionally
6096                  * since we emulate it
6097                  */
6098                 if (enable_ept)
6099                         msrs->vmfunc_controls =
6100                                 VMX_VMFUNC_EPTP_SWITCHING;
6101         }
6102
6103         /*
6104          * Old versions of KVM use the single-context version without
6105          * checking for support, so declare that it is supported even
6106          * though it is treated as global context.  The alternative is
6107          * not failing the single-context invvpid, and it is worse.
6108          */
6109         if (enable_vpid) {
6110                 msrs->secondary_ctls_high |=
6111                         SECONDARY_EXEC_ENABLE_VPID;
6112                 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6113                         VMX_VPID_EXTENT_SUPPORTED_MASK;
6114         }
6115
6116         if (enable_unrestricted_guest)
6117                 msrs->secondary_ctls_high |=
6118                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
6119
6120         if (flexpriority_enabled)
6121                 msrs->secondary_ctls_high |=
6122                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6123
6124         /* miscellaneous data */
6125         rdmsr(MSR_IA32_VMX_MISC,
6126                 msrs->misc_low,
6127                 msrs->misc_high);
6128         msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6129         msrs->misc_low |=
6130                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6131                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6132                 VMX_MISC_ACTIVITY_HLT;
6133         msrs->misc_high = 0;
6134
6135         /*
6136          * This MSR reports some information about VMX support. We
6137          * should return information about the VMX we emulate for the
6138          * guest, and the VMCS structure we give it - not about the
6139          * VMX support of the underlying hardware.
6140          */
6141         msrs->basic =
6142                 VMCS12_REVISION |
6143                 VMX_BASIC_TRUE_CTLS |
6144                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6145                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6146
6147         if (cpu_has_vmx_basic_inout())
6148                 msrs->basic |= VMX_BASIC_INOUT;
6149
6150         /*
6151          * These MSRs specify bits which the guest must keep fixed on
6152          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6153          * We picked the standard core2 setting.
6154          */
6155 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6156 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
6157         msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6158         msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6159
6160         /* These MSRs specify bits which the guest must keep fixed off. */
6161         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6162         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6163
6164         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6165         msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6166 }
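
/*
 * Illustration only, not used by KVM: a minimal sketch of the low/high
 * check described in the comment above nested_vmx_setup_ctls_msrs()
 * (KVM's own vmx_control_verify() may differ in detail): every must-be-1
 * bit from the low half has to be set, and no bit outside the may-be-1
 * high half may be set.
 */
static inline bool nested_ctl_valid_sketch(u32 control, u32 low, u32 high)
{
        return (control & low) == low && (control & ~high) == 0;
}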
6167
6168 void nested_vmx_hardware_unsetup(void)
6169 {
6170         int i;
6171
6172         if (enable_shadow_vmcs) {
6173                 for (i = 0; i < VMX_BITMAP_NR; i++)
6174                         free_page((unsigned long)vmx_bitmap[i]);
6175         }
6176 }
6177
6178 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6179 {
6180         int i;
6181
6182         if (!cpu_has_vmx_shadow_vmcs())
6183                 enable_shadow_vmcs = 0;
6184         if (enable_shadow_vmcs) {
6185                 for (i = 0; i < VMX_BITMAP_NR; i++) {
6186                         /*
6187                          * The vmx_bitmap is not tied to a VM and so should
6188                          * not be charged to a memcg.
6189                          */
6190                         vmx_bitmap[i] = (unsigned long *)
6191                                 __get_free_page(GFP_KERNEL);
6192                         if (!vmx_bitmap[i]) {
6193                                 nested_vmx_hardware_unsetup();
6194                                 return -ENOMEM;
6195                         }
6196                 }
6197
6198                 init_vmcs_shadow_fields();
6199         }
6200
6201         exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear;
6202         exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch;
6203         exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld;
6204         exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst;
6205         exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread;
6206         exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume;
6207         exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite;
6208         exit_handlers[EXIT_REASON_VMOFF]        = handle_vmoff;
6209         exit_handlers[EXIT_REASON_VMON]         = handle_vmon;
6210         exit_handlers[EXIT_REASON_INVEPT]       = handle_invept;
6211         exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
6212         exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
6213
6214         kvm_x86_ops->check_nested_events = vmx_check_nested_events;
6215         kvm_x86_ops->get_nested_state = vmx_get_nested_state;
6216         kvm_x86_ops->set_nested_state = vmx_set_nested_state;
6217         kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
6218         kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
6219         kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
6220
6221         return 0;
6222 }