Merge tag 'v5.10-rc1' into regulator-5.10
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c4015a4..397f599 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -194,7 +194,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
-bool __read_mostly allow_smaller_maxphyaddr;
+bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
 static u64 __read_mostly host_xss;
@@ -982,6 +982,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
                                   X86_CR4_SMEP;
+       unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
 
        if (kvm_valid_cr4(vcpu, cr4))
                return 1;
@@ -1009,7 +1010,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        if (kvm_x86_ops.set_cr4(vcpu, cr4))
                return 1;
 
-       if (((cr4 ^ old_cr4) & pdptr_bits) ||
+       if (((cr4 ^ old_cr4) & mmu_role_bits) ||
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
@@ -1457,6 +1458,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        u64 old_efer = vcpu->arch.efer;
        u64 efer = msr_info->data;
+       int r;
 
        if (efer & efer_reserved_bits)
                return 1;
@@ -1473,7 +1475,11 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        efer &= ~EFER_LMA;
        efer |= vcpu->arch.efer & EFER_LMA;
 
-       kvm_x86_ops.set_efer(vcpu, efer);
+       r = kvm_x86_ops.set_efer(vcpu, efer);
+       if (r) {
+               WARN_ON(r > 0);
+               return r;
+       }
 
        /* Update reserved bits */
        if ((efer ^ old_efer) & EFER_NX)
@@ -1497,8 +1503,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
        bool r = kvm->arch.msr_filter.default_allow;
        int idx;
 
-       /* MSR filtering not set up, allow everything */
-       if (!count)
+       /* No filter set up, or MSR is in the x2APIC range (never filtered) */
+       if (!count || (index >= 0x800 && index <= 0x8ff))
                return true;
 
        /* Prevent collision with set_msr_filter */
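
For context, a hedged userspace sketch (not part of the patch) of the filter that kvm_msr_allowed() consults above. It assumes the KVM_X86_SET_MSR_FILTER ioctl and the kvm_msr_filter structures from <linux/kvm.h>; vm_fd is a hypothetical, already-created VM file descriptor. As the hunk above shows, MSRs in the x2APIC range 0x800-0x8ff are never filtered, and with the kvm_vm_ioctl_set_msr_filter() hunk further down, an all-empty filter combined with KVM_MSR_FILTER_DEFAULT_DENY is rejected with -EINVAL.

/*
 * Hedged sketch, not part of the patch: deny all MSRs by default and
 * allow one example range of 256 MSRs starting at 0xc0000000.
 * A bit value of 1 in the bitmap allows the access types in 'flags'.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int install_example_msr_filter(int vm_fd)
{
	static __u8 bitmap[256 / 8];		/* one bit per MSR in the range */
	struct kvm_msr_filter filter;

	memset(bitmap, 0xff, sizeof(bitmap));	/* allow everything in the range */
	memset(&filter, 0, sizeof(filter));

	filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;
	filter.ranges[0].flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE;
	filter.ranges[0].base = 0xc0000000;	/* example range only */
	filter.ranges[0].nmsrs = 256;
	filter.ranges[0].bitmap = bitmap;

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}
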
@@ -1737,13 +1743,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
        r = kvm_set_msr(vcpu, ecx, data);
 
        /* MSR write failed? See if we should ask user space */
-       if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) {
+       if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
                /* Bounce to user space */
                return 0;
-       }
+
+       /* Signal all other negative errors to userspace */
+       if (r < 0)
+               return r;
 
        /* MSR write failed? Inject a #GP */
-       if (r) {
+       if (r > 0) {
                trace_kvm_msr_write_ex(ecx, data);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -1916,6 +1925,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        struct pvclock_wall_clock wc;
        u64 wall_nsec;
 
+       kvm->arch.wall_clock = wall_clock;
+
        if (!wall_clock)
                return;
 
@@ -1948,6 +1959,34 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 }
 
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+                                 bool old_msr, bool host_initiated)
+{
+       struct kvm_arch *ka = &vcpu->kvm->arch;
+
+       if (vcpu->vcpu_id == 0 && !host_initiated) {
+               if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+                       kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+               ka->boot_vcpu_runs_old_kvmclock = old_msr;
+       }
+
+       vcpu->arch.time = system_time;
+       kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+       /* we verify if the enable bit is set... */
+       vcpu->arch.pv_time_enabled = false;
+       if (!(system_time & 1))
+               return;
+
+       if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
+                                      &vcpu->arch.pv_time, system_time & ~1ULL,
+                                      sizeof(struct pvclock_vcpu_time_info)))
+               vcpu->arch.pv_time_enabled = true;
+}
+
 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 {
        do_shl32_div32(dividend, divisor);
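
For context, a hedged guest-side sketch (not part of the patch) of the MSR protocol that kvm_write_system_time() now backs: the guest writes the guest-physical address of a pvclock_vcpu_time_info area to MSR_KVM_SYSTEM_TIME_NEW, with bit 0 acting as the enable bit the handler tests. The static my_time_info variable is a hypothetical stand-in for the per-cpu hv_clock area a real guest kernel would use.

/*
 * Hedged guest-kernel sketch, not part of the patch: enable kvmclock on
 * the current vCPU. Writing the address with bit 0 clear disables it,
 * which is why kvm_write_system_time() resets pv_time_enabled first.
 */
#include <asm/kvm_para.h>	/* MSR_KVM_SYSTEM_TIME_NEW */
#include <asm/msr.h>		/* wrmsrl() */
#include <asm/page.h>		/* __pa() */
#include <asm/pvclock-abi.h>	/* struct pvclock_vcpu_time_info */

static struct pvclock_vcpu_time_info my_time_info __aligned(32);	/* hypothetical */

static void enable_kvmclock_on_this_cpu(void)
{
	/* Guest-physical address of the time info, with the enable bit set. */
	wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, __pa(&my_time_info) | 1ULL);
}
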
@@ -2812,24 +2851,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
        u32 page_num = data & ~PAGE_MASK;
        u64 page_addr = data & PAGE_MASK;
        u8 *page;
-       int r;
 
-       r = -E2BIG;
        if (page_num >= blob_size)
-               goto out;
-       r = -ENOMEM;
+               return 1;
+
        page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
-       if (IS_ERR(page)) {
-               r = PTR_ERR(page);
-               goto out;
+       if (IS_ERR(page))
+               return PTR_ERR(page);
+
+       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+               kfree(page);
+               return 1;
        }
-       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
-               goto out_free;
-       r = 0;
-out_free:
-       kfree(page);
-out:
-       return r;
+       kfree(page);
+       return 0;
 }
 
 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
@@ -2847,6 +2881,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
        if (data & 0x30)
                return 1;
 
+       if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+           (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+               return 1;
+
+       if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+           (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
+               return 1;
+
        if (!lapic_in_kernel(vcpu))
                return data ? 1 : 0;
 
@@ -2924,10 +2966,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
         * Doing a TLB flush here, on the guest's behalf, can avoid
         * expensive IPIs.
         */
-       trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-               st->preempted & KVM_VCPU_FLUSH_TLB);
-       if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
-               kvm_vcpu_flush_tlb_guest(vcpu);
+       if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+               trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+                                      st->preempted & KVM_VCPU_FLUSH_TLB);
+               if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+                       kvm_vcpu_flush_tlb_guest(vcpu);
+       }
 
        vcpu->arch.st.preempted = 0;
 
@@ -3088,53 +3132,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.smi_count = data;
                break;
        case MSR_KVM_WALL_CLOCK_NEW:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+                       return 1;
+
+               kvm_write_wall_clock(vcpu->kvm, data);
+               break;
        case MSR_KVM_WALL_CLOCK:
-               vcpu->kvm->arch.wall_clock = data;
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+                       return 1;
+
                kvm_write_wall_clock(vcpu->kvm, data);
                break;
        case MSR_KVM_SYSTEM_TIME_NEW:
-       case MSR_KVM_SYSTEM_TIME: {
-               struct kvm_arch *ka = &vcpu->kvm->arch;
-
-               if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
-                       bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
-
-                       if (ka->boot_vcpu_runs_old_kvmclock != tmp)
-                               kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-
-                       ka->boot_vcpu_runs_old_kvmclock = tmp;
-               }
-
-               vcpu->arch.time = data;
-               kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
-
-               /* we verify if the enable bit is set... */
-               vcpu->arch.pv_time_enabled = false;
-               if (!(data & 1))
-                       break;
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+                       return 1;
 
-               if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
-                    &vcpu->arch.pv_time, data & ~1ULL,
-                    sizeof(struct pvclock_vcpu_time_info)))
-                       vcpu->arch.pv_time_enabled = true;
+               kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
+               break;
+       case MSR_KVM_SYSTEM_TIME:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+                       return 1;
 
+               kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
                break;
-       }
        case MSR_KVM_ASYNC_PF_EN:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+                       return 1;
+
                if (kvm_pv_enable_async_pf(vcpu, data))
                        return 1;
                break;
        case MSR_KVM_ASYNC_PF_INT:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+                       return 1;
+
                if (kvm_pv_enable_async_pf_int(vcpu, data))
                        return 1;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+                       return 1;
                if (data & 0x1) {
                        vcpu->arch.apf.pageready_pending = false;
                        kvm_check_async_pf_completion(vcpu);
                }
                break;
        case MSR_KVM_STEAL_TIME:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+                       return 1;
 
                if (unlikely(!sched_info_on()))
                        return 1;
@@ -3151,11 +3196,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
                break;
        case MSR_KVM_PV_EOI_EN:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+                       return 1;
+
                if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
                        return 1;
                break;
 
        case MSR_KVM_POLL_CONTROL:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+                       return 1;
+
                /* only enable bit supported */
                if (data & (-1ULL << 1))
                        return 1;
@@ -3350,9 +3401,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * even when not intercepted. AMD manual doesn't explicitly
                 * state this but appears to behave the same.
                 *
-                * Unconditionally return L1's TSC offset on userspace reads
-                * so that userspace reads and writes always operate on L1's
-                * offset, e.g. to ensure deterministic behavior for migration.
+                * On userspace reads and writes, however, we unconditionally
+                * return L1's TSC value to ensure backwards-compatible
+                * behavior for migration.
                 */
                u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
                                                            vcpu->arch.tsc_offset;
@@ -3651,6 +3702,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_LAST_CPU:
        case KVM_CAP_X86_USER_SPACE_MSR:
        case KVM_CAP_X86_MSR_FILTER:
+       case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                r = 1;
                break;
        case KVM_CAP_SYNC_REGS:
@@ -4521,6 +4573,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
                return kvm_x86_ops.enable_direct_tlbflush(vcpu);
 
+       case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+               vcpu->arch.pv_cpuid.enforce = cap->args[0];
+
+               return 0;
+
        default:
                return -EINVAL;
        }
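
A hedged userspace sketch (not part of the patch) of how a VMM could opt in to the new enforcement: check the capability, then enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID on each vCPU so the guest_pv_has() gates added throughout this patch consult the PV feature bits in the guest's CPUID. vm_fd and vcpu_fd are hypothetical, already-created descriptors.

/*
 * Hedged sketch, not part of the patch: args[0] = 1 ends up in
 * vcpu->arch.pv_cpuid.enforce (see the hunk above); with it set, PV MSRs
 * and hypercalls whose feature bits are missing from the guest's CPUID
 * are refused.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enforce_pv_feature_cpuid(int vm_fd, int vcpu_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID,
		.args = { 1 },
	};

	/* The capability is advertised via KVM_CHECK_EXTENSION on the VM. */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ENFORCE_PV_FEATURE_CPUID) <= 0)
		return -1;

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}
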
@@ -5252,14 +5309,21 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
        struct kvm_msr_filter filter;
        bool default_allow;
        int r = 0;
+       bool empty = true;
        u32 i;
 
        if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
                return -EFAULT;
 
-       kvm_clear_msr_filter(kvm);
+       for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
+               empty &= !filter.ranges[i].nmsrs;
 
        default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+       if (empty && !default_allow)
+               return -EINVAL;
+
+       kvm_clear_msr_filter(kvm);
+
        kvm->arch.msr_filter.default_allow = default_allow;
 
        /*
@@ -7986,11 +8050,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                goto out;
        }
 
+       ret = -KVM_ENOSYS;
+
        switch (nr) {
        case KVM_HC_VAPIC_POLL_IRQ:
                ret = 0;
                break;
        case KVM_HC_KICK_CPU:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
+                       break;
+
                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
                kvm_sched_yield(vcpu->kvm, a1);
                ret = 0;
@@ -8001,9 +8070,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                break;
 #endif
        case KVM_HC_SEND_IPI:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
+                       break;
+
                ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
                break;
        case KVM_HC_SCHED_YIELD:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
+                       break;
+
                kvm_sched_yield(vcpu->kvm, a0);
                ret = 0;
                break;
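
A hedged guest-side sketch (not part of the patch) of what moving ret = -KVM_ENOSYS above the switch means: a hypercall whose PV feature is not exposed to the guest now completes with -KVM_ENOSYS in RAX instead of being performed. The wrapper follows the KVM hypercall ABI (number in RAX, first argument in RBX, result in RAX); AMD guests would use vmmcall instead of vmcall.

/*
 * Hedged guest-side sketch, not part of the patch: KVM_HC_SCHED_YIELD
 * returns -KVM_ENOSYS when KVM_FEATURE_PV_SCHED_YIELD is not advertised
 * (or, with enforcement enabled, not present in the guest's CPUID).
 */
#include <linux/kvm_para.h>	/* KVM_HC_SCHED_YIELD, KVM_ENOSYS */

static inline long kvm_hypercall1_sketch(unsigned int nr, unsigned long p1)
{
	long ret;

	asm volatile("vmcall"		/* vmmcall on AMD */
		     : "=a"(ret)
		     : "a"(nr), "b"(p1)
		     : "memory");
	return ret;
}

static bool pv_sched_yield_to(unsigned long dest_apicid)
{
	/* -KVM_ENOSYS means: fall back to a normal IPI/yield path. */
	return kvm_hypercall1_sketch(KVM_HC_SCHED_YIELD, dest_apicid) != -KVM_ENOSYS;
}
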
@@ -9877,6 +9952,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_mmu_destroy(vcpu);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        free_page((unsigned long)vcpu->arch.pio_data);
+       kvfree(vcpu->arch.cpuid_entries);
        if (!lapic_in_kernel(vcpu))
                static_key_slow_dec(&kvm_no_apic_vcpu);
 }