Merge tag 'y2038-timekeeping' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd...
authorThomas Gleixner <tglx@linutronix.de>
Thu, 19 Apr 2018 14:27:44 +0000 (16:27 +0200)
committerThomas Gleixner <tglx@linutronix.de>
Thu, 19 Apr 2018 14:27:44 +0000 (16:27 +0200)
Pull y2038 timekeeping syscall changes from Arnd Bergmann:

This is the first set of system call entry point changes to enable 32-bit
architectures to have variants on both 32-bit and 64-bit time_t. Typically
these system calls take a 'struct timespec' argument, but that structure
is defined in user space by the C library and its layout will change.

The kernel already supports handling the 32-bit time_t on 64-bit
architectures through the CONFIG_COMPAT mechanism. As there are a total
of 51 system calls suffering from this problem, reusing that mechanism
on 32-bit architectures.

We already have patches for most of the remaining system calls, but this
set contains most of the complexity and is best tested.  There was one
last-minute regression that prevented it from going into 4.17, but that
is fixed now.

More details from Deepa's patch series description:

   Big picture is as per the lwn article:
   https://lwn.net/Articles/643234/ [2]

   The series is directed at converting posix clock syscalls:
   clock_gettime, clock_settime, clock_getres and clock_nanosleep
   to use a new data structure __kernel_timespec at syscall boundaries.
   __kernel_timespec maintains 64 bit time_t across all execution modes.

   vdso will be handled as part of each architecture when they enable
   support for 64 bit time_t.

   The compat syscalls are repurposed to provide backward compatibility
   by using them as native syscalls as well for 32 bit architectures.
   They will continue to use timespec at syscall boundaries.

   CONFIG_64_BIT_TIME controls whether the syscalls use __kernel_timespec
   or timespec at syscall boundaries.

   The series does the following:
   1. Enable compat syscalls on 32 bit architectures.
   2. Add a new __kernel_timespec type to be used as the data structure
      for all the new syscalls.
   3. Add new config CONFIG_64BIT_TIME(intead of the CONFIG_COMPAT_TIME in
      [1] and [2] to switch to new definition of __kernel_timespec. It is
      the same as struct timespec otherwise.
   4. Add new CONFIG_32BIT_TIME to conditionally compile compat syscalls.

21 files changed:
Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt [new file with mode: 0644]
Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt
arch/parisc/kernel/Makefile
arch/x86/include/asm/kvm_host.h
arch/x86/kernel/ldt.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/block/rbd.c
drivers/clocksource/Kconfig
drivers/clocksource/Makefile
drivers/clocksource/timer-imx-tpm.c
drivers/clocksource/timer-npcm7xx.c [new file with mode: 0644]
fs/ceph/inode.c
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/vmx.h [new file with mode: 0644]
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/sparsebit.c
tools/testing/selftests/kvm/lib/vmx.c [new file with mode: 0644]
tools/testing/selftests/kvm/vmx_tsc_adjust_test.c [new file with mode: 0644]

diff --git a/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt b/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt
new file mode 100644 (file)
index 0000000..ea22dfe
--- /dev/null
@@ -0,0 +1,21 @@
+Nuvoton NPCM7xx timer
+
+Nuvoton NPCM7xx have three timer modules, each timer module provides five 24-bit
+timer counters.
+
+Required properties:
+- compatible      : "nuvoton,npcm750-timer" for Poleg NPCM750.
+- reg             : Offset and length of the register set for the device.
+- interrupts      : Contain the timer interrupt with flags for
+                    falling edge.
+- clocks          : phandle of timer reference clock (usually a 25 MHz clock).
+
+Example:
+
+timer@f0008000 {
+    compatible = "nuvoton,npcm750-timer";
+    interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>;
+    reg = <0xf0008000 0x50>;
+    clocks = <&clk NPCM7XX_CLK_TIMER>;
+};
+
index b4aa7dd..f82087b 100644 (file)
@@ -15,7 +15,7 @@ Required properties:
 - interrupts : Should be the clock event device interrupt.
 - clocks :     The clocks provided by the SoC to drive the timer, must contain
                an entry for each entry in clock-names.
-- clock-names : Must include the following entries: "igp" and "per".
+- clock-names : Must include the following entries: "ipg" and "per".
 
 Example:
 tpm5: tpm@40260000 {
index eafd06a..e5de34d 100644 (file)
@@ -23,7 +23,7 @@ obj-$(CONFIG_SMP)     += smp.o
 obj-$(CONFIG_PA11)     += pci-dma.o
 obj-$(CONFIG_PCI)      += pci.o
 obj-$(CONFIG_MODULES)  += module.o
-obj-$(CONFIG_64BIT)    += binfmt_elf32.o sys_parisc32.o signal32.o
+obj-$(CONFIG_64BIT)    += sys_parisc32.o signal32.o
 obj-$(CONFIG_STACKTRACE)+= stacktrace.o
 obj-$(CONFIG_AUDIT)    += audit.o
 obj64-$(CONFIG_AUDIT)  += compat_audit.o
index 949c977..c25775f 100644 (file)
@@ -1013,6 +1013,7 @@ struct kvm_x86_ops {
 
        bool (*has_wbinvd_exit)(void);
 
+       u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
        void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
index d41d896..c9b1402 100644 (file)
@@ -166,7 +166,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
                 */
                pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
                /* Filter out unsuppored __PAGE_KERNEL* bits: */
-               pgprot_val(pte_prot) |= __supported_pte_mask;
+               pgprot_val(pte_prot) &= __supported_pte_mask;
                pte = pfn_pte(pfn, pte_prot);
                set_pte_at(mm, va, ptep, pte);
                pte_unmap_unlock(ptep, ptl);
index b58787d..1fc05e4 100644 (file)
@@ -1423,12 +1423,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
 
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (is_guest_mode(vcpu))
+               return svm->nested.hsave->control.tsc_offset;
+
+       return vcpu->arch.tsc_offset;
+}
+
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 g_tsc_offset = 0;
 
        if (is_guest_mode(vcpu)) {
+               /* Write L1's TSC offset.  */
                g_tsc_offset = svm->vmcb->control.tsc_offset -
                               svm->nested.hsave->control.tsc_offset;
                svm->nested.hsave->control.tsc_offset = offset;
@@ -3322,6 +3333,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        /* Restore the original control entries */
        copy_vmcb_control_area(vmcb, hsave);
 
+       svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
        kvm_clear_exception_queue(&svm->vcpu);
        kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -3482,10 +3494,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
        /* We don't want to see VMMCALLs from a nested guest */
        clr_intercept(svm, INTERCEPT_VMMCALL);
 
+       svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
        svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
-       svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -4035,12 +4049,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        switch (msr_info->index) {
-       case MSR_IA32_TSC: {
-               msr_info->data = svm->vmcb->control.tsc_offset +
-                       kvm_scale_tsc(vcpu, rdtsc());
-
-               break;
-       }
        case MSR_STAR:
                msr_info->data = svm->vmcb->save.star;
                break;
@@ -4193,9 +4201,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->vmcb->save.g_pat = data;
                mark_dirty(svm->vmcb, VMCB_NPT);
                break;
-       case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
@@ -5265,9 +5270,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                }
 
                if (!ret && svm) {
-                       trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
-                                                host_irq, e->gsi,
-                                                vcpu_info.vector,
+                       trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+                                                e->gsi, vcpu_info.vector,
                                                 vcpu_info.pi_desc_addr, set);
                }
 
@@ -7102,6 +7106,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
+       .read_l1_tsc_offset = svm_read_l1_tsc_offset,
        .write_tsc_offset = svm_write_tsc_offset,
 
        .set_tdp_cr3 = set_tdp_cr3,
index aafcc98..aa66ccd 100644 (file)
@@ -2880,18 +2880,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 {
-       u64 host_tsc, tsc_offset;
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       host_tsc = rdtsc();
-       tsc_offset = vmcs_read64(TSC_OFFSET);
-       return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
+       if (is_guest_mode(vcpu) &&
+           (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+               return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+       return vcpu->arch.tsc_offset;
 }
 
 /*
@@ -3524,9 +3521,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 #endif
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_info);
-       case MSR_IA32_TSC:
-               msr_info->data = guest_read_tsc(vcpu);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
@@ -3646,9 +3640,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
-       case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr_info);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
@@ -10608,6 +10599,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
        return true;
 }
 
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+           !page_address_valid(vcpu, vmcs12->apic_access_addr))
+               return -EINVAL;
+       else
+               return 0;
+}
+
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
                                           struct vmcs12 *vmcs12)
 {
@@ -11176,11 +11177,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
        }
 
-       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-               vmcs_write64(TSC_OFFSET,
-                       vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-       else
-               vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+       vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
@@ -11299,6 +11297,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+       if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -11420,6 +11421,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        u32 msr_entry_idx;
        u32 exit_qual;
+       int r;
 
        enter_guest_mode(vcpu);
 
@@ -11429,26 +11431,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
        vmx_segment_cache_clear(vmx);
 
-       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                                        EXIT_REASON_INVALID_STATE, exit_qual);
-               return 1;
-       }
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+       r = EXIT_REASON_INVALID_STATE;
+       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+               goto fail;
 
        nested_get_vmcs12_pages(vcpu, vmcs12);
 
+       r = EXIT_REASON_MSR_LOAD_FAIL;
        msr_entry_idx = nested_vmx_load_msr(vcpu,
                                            vmcs12->vm_entry_msr_load_addr,
                                            vmcs12->vm_entry_msr_load_count);
-       if (msr_entry_idx) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-               return 1;
-       }
+       if (msr_entry_idx)
+               goto fail;
 
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -11457,6 +11454,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
         * the success flag) when L2 exits (see nested_vmx_vmexit()).
         */
        return 0;
+
+fail:
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+       leave_guest_mode(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+       nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+       return 1;
 }
 
 /*
@@ -12028,6 +12033,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        leave_guest_mode(vcpu);
 
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
        if (likely(!vmx->fail)) {
                if (exit_reason == -1)
                        sync_vmcs12(vcpu, vmcs12);
@@ -12224,10 +12232,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
 
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u64 tscl = rdtsc();
-       u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
-       u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+       struct vcpu_vmx *vmx;
+       u64 tscl, guest_tscl, delta_tsc;
+
+       if (kvm_mwait_in_guest(vcpu->kvm))
+               return -EOPNOTSUPP;
+
+       vmx = to_vmx(vcpu);
+       tscl = rdtsc();
+       guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+       delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
        /* Convert to host delta tsc if tsc scaling is enabled */
        if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
@@ -12533,7 +12547,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
                if (set)
@@ -12712,6 +12726,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
+       .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
 
        .set_tdp_cr3 = vmx_set_cr3,
index b2ff74b..51ecd38 100644 (file)
@@ -1490,7 +1490,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-       u64 curr_offset = vcpu->arch.tsc_offset;
+       u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1532,7 +1532,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-       return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+       u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+
+       return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
@@ -2362,6 +2364,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vcpu->arch.smbase = data;
                break;
+       case MSR_IA32_TSC:
+               kvm_write_tsc(vcpu, msr_info);
+               break;
        case MSR_SMI_COUNT:
                if (!msr_info->host_initiated)
                        return 1;
@@ -2605,6 +2610,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_UCODE_REV:
                msr_info->data = vcpu->arch.microcode_version;
                break;
+       case MSR_IA32_TSC:
+               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+               break;
        case MSR_MTRRcap:
        case 0x200 ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2819,7 +2827,8 @@ out:
 static inline bool kvm_can_mwait_in_guest(void)
 {
        return boot_cpu_has(X86_FEATURE_MWAIT) &&
-               !boot_cpu_has_bug(X86_BUG_MONITOR);
+               !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+               boot_cpu_has(X86_FEATURE_ARAT);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
index 07dc541..8e8b04c 100644 (file)
@@ -732,6 +732,7 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
  */
 enum {
        Opt_queue_depth,
+       Opt_lock_timeout,
        Opt_last_int,
        /* int args above */
        Opt_last_string,
@@ -740,11 +741,13 @@ enum {
        Opt_read_write,
        Opt_lock_on_read,
        Opt_exclusive,
+       Opt_notrim,
        Opt_err
 };
 
 static match_table_t rbd_opts_tokens = {
        {Opt_queue_depth, "queue_depth=%d"},
+       {Opt_lock_timeout, "lock_timeout=%d"},
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
@@ -753,20 +756,25 @@ static match_table_t rbd_opts_tokens = {
        {Opt_read_write, "rw"},         /* Alternate spelling */
        {Opt_lock_on_read, "lock_on_read"},
        {Opt_exclusive, "exclusive"},
+       {Opt_notrim, "notrim"},
        {Opt_err, NULL}
 };
 
 struct rbd_options {
        int     queue_depth;
+       unsigned long   lock_timeout;
        bool    read_only;
        bool    lock_on_read;
        bool    exclusive;
+       bool    trim;
 };
 
 #define RBD_QUEUE_DEPTH_DEFAULT        BLKDEV_MAX_RQ
+#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 #define RBD_READ_ONLY_DEFAULT  false
 #define RBD_LOCK_ON_READ_DEFAULT false
 #define RBD_EXCLUSIVE_DEFAULT  false
+#define RBD_TRIM_DEFAULT       true
 
 static int parse_rbd_opts_token(char *c, void *private)
 {
@@ -796,6 +804,14 @@ static int parse_rbd_opts_token(char *c, void *private)
                }
                rbd_opts->queue_depth = intval;
                break;
+       case Opt_lock_timeout:
+               /* 0 is "wait forever" (i.e. infinite timeout) */
+               if (intval < 0 || intval > INT_MAX / 1000) {
+                       pr_err("lock_timeout out of range\n");
+                       return -EINVAL;
+               }
+               rbd_opts->lock_timeout = msecs_to_jiffies(intval * 1000);
+               break;
        case Opt_read_only:
                rbd_opts->read_only = true;
                break;
@@ -808,6 +824,9 @@ static int parse_rbd_opts_token(char *c, void *private)
        case Opt_exclusive:
                rbd_opts->exclusive = true;
                break;
+       case Opt_notrim:
+               rbd_opts->trim = false;
+               break;
        default:
                /* libceph prints "bad option" msg */
                return -EINVAL;
@@ -1392,7 +1411,7 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req)
        case OBJ_OP_DISCARD:
                return true;
        default:
-               rbd_assert(0);
+               BUG();
        }
 }
 
@@ -2466,7 +2485,7 @@ again:
                }
                return false;
        default:
-               rbd_assert(0);
+               BUG();
        }
 }
 
@@ -2494,7 +2513,7 @@ static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
                }
                return false;
        default:
-               rbd_assert(0);
+               BUG();
        }
 }
 
@@ -3533,9 +3552,22 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 /*
  * lock_rwsem must be held for read
  */
-static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
+static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 {
        DEFINE_WAIT(wait);
+       unsigned long timeout;
+       int ret = 0;
+
+       if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
+               return -EBLACKLISTED;
+
+       if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
+               return 0;
+
+       if (!may_acquire) {
+               rbd_warn(rbd_dev, "exclusive lock required");
+               return -EROFS;
+       }
 
        do {
                /*
@@ -3547,12 +3579,22 @@ static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
                prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
                                          TASK_UNINTERRUPTIBLE);
                up_read(&rbd_dev->lock_rwsem);
-               schedule();
+               timeout = schedule_timeout(ceph_timeout_jiffies(
+                                               rbd_dev->opts->lock_timeout));
                down_read(&rbd_dev->lock_rwsem);
-       } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
-                !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
+               if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
+                       ret = -EBLACKLISTED;
+                       break;
+               }
+               if (!timeout) {
+                       rbd_warn(rbd_dev, "timed out waiting for lock");
+                       ret = -ETIMEDOUT;
+                       break;
+               }
+       } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
 
        finish_wait(&rbd_dev->lock_waitq, &wait);
+       return ret;
 }
 
 static void rbd_queue_workfn(struct work_struct *work)
@@ -3638,19 +3680,10 @@ static void rbd_queue_workfn(struct work_struct *work)
            (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
        if (must_be_locked) {
                down_read(&rbd_dev->lock_rwsem);
-               if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
-                   !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
-                       if (rbd_dev->opts->exclusive) {
-                               rbd_warn(rbd_dev, "exclusive lock required");
-                               result = -EROFS;
-                               goto err_unlock;
-                       }
-                       rbd_wait_state_locked(rbd_dev);
-               }
-               if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
-                       result = -EBLACKLISTED;
+               result = rbd_wait_state_locked(rbd_dev,
+                                              !rbd_dev->opts->exclusive);
+               if (result)
                        goto err_unlock;
-               }
        }
 
        img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
@@ -3902,7 +3935,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
        struct gendisk *disk;
        struct request_queue *q;
-       u64 segment_size;
+       unsigned int objset_bytes =
+           rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
        int err;
 
        /* create gendisk info */
@@ -3942,20 +3976,19 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
        /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
 
-       /* set io sizes to object size */
-       segment_size = rbd_obj_bytes(&rbd_dev->header);
-       blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+       blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
        q->limits.max_sectors = queue_max_hw_sectors(q);
        blk_queue_max_segments(q, USHRT_MAX);
        blk_queue_max_segment_size(q, UINT_MAX);
-       blk_queue_io_min(q, segment_size);
-       blk_queue_io_opt(q, segment_size);
+       blk_queue_io_min(q, objset_bytes);
+       blk_queue_io_opt(q, objset_bytes);
 
-       /* enable the discard support */
-       blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
-       q->limits.discard_granularity = segment_size;
-       blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
-       blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
+       if (rbd_dev->opts->trim) {
+               blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+               q->limits.discard_granularity = objset_bytes;
+               blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
+               blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
+       }
 
        if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
                q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
@@ -5179,8 +5212,10 @@ static int rbd_add_parse_args(const char *buf,
 
        rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
        rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
+       rbd_opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
        rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
        rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
+       rbd_opts->trim = RBD_TRIM_DEFAULT;
 
        copts = ceph_parse_options(options, mon_addrs,
                                        mon_addrs + mon_addrs_size - 1,
@@ -5216,6 +5251,8 @@ static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
 
 static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
 {
+       int ret;
+
        if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
                rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
                return -EINVAL;
@@ -5223,9 +5260,9 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
 
        /* FIXME: "rbd map --exclusive" should be in interruptible */
        down_read(&rbd_dev->lock_rwsem);
-       rbd_wait_state_locked(rbd_dev);
+       ret = rbd_wait_state_locked(rbd_dev, true);
        up_read(&rbd_dev->lock_rwsem);
-       if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
+       if (ret) {
                rbd_warn(rbd_dev, "failed to acquire exclusive lock");
                return -EROFS;
        }
index 9ee2888..8e8a097 100644 (file)
@@ -133,6 +133,14 @@ config VT8500_TIMER
        help
          Enables support for the VT8500 driver.
 
+config NPCM7XX_TIMER
+       bool "NPCM7xx timer driver" if COMPILE_TEST
+       depends on HAS_IOMEM
+       select CLKSRC_MMIO
+       help
+         Enable 24-bit TIMER0 and TIMER1 counters in the NPCM7xx architecture,
+         While TIMER0 serves as clockevent and TIMER1 serves as clocksource.
+
 config CADENCE_TTC_TIMER
        bool "Cadence TTC timer driver" if COMPILE_TEST
        depends on COMMON_CLK
index e8e76df..00caf37 100644 (file)
@@ -56,6 +56,7 @@ obj-$(CONFIG_CLKSRC_NPS)      += timer-nps.o
 obj-$(CONFIG_OXNAS_RPS_TIMER)  += timer-oxnas-rps.o
 obj-$(CONFIG_OWL_TIMER)                += owl-timer.o
 obj-$(CONFIG_SPRD_TIMER)       += timer-sprd.o
+obj-$(CONFIG_NPCM7XX_TIMER)    += timer-npcm7xx.o
 
 obj-$(CONFIG_ARC_TIMERS)               += arc_timer.o
 obj-$(CONFIG_ARM_ARCH_TIMER)           += arm_arch_timer.o
index 21bffdc..05d97a6 100644 (file)
 #include <linux/of_irq.h>
 #include <linux/sched_clock.h>
 
+#define TPM_PARAM                      0x4
+#define TPM_PARAM_WIDTH_SHIFT          16
+#define TPM_PARAM_WIDTH_MASK           (0xff << 16)
 #define TPM_SC                         0x10
 #define TPM_SC_CMOD_INC_PER_CNT                (0x1 << 3)
 #define TPM_SC_CMOD_DIV_DEFAULT                0x3
+#define TPM_SC_CMOD_DIV_MAX            0x7
+#define TPM_SC_TOF_MASK                        (0x1 << 7)
 #define TPM_CNT                                0x14
 #define TPM_MOD                                0x18
 #define TPM_STATUS                     0x1c
 #define TPM_C0SC_MODE_SHIFT            2
 #define TPM_C0SC_MODE_MASK             0x3c
 #define TPM_C0SC_MODE_SW_COMPARE       0x4
+#define TPM_C0SC_CHF_MASK              (0x1 << 7)
 #define TPM_C0V                                0x24
 
+static int counter_width;
+static int rating;
 static void __iomem *timer_base;
 static struct clock_event_device clockevent_tpm;
 
@@ -83,10 +91,11 @@ static int __init tpm_clocksource_init(unsigned long rate)
        tpm_delay_timer.freq = rate;
        register_current_timer_delay(&tpm_delay_timer);
 
-       sched_clock_register(tpm_read_sched_clock, 32, rate);
+       sched_clock_register(tpm_read_sched_clock, counter_width, rate);
 
        return clocksource_mmio_init(timer_base + TPM_CNT, "imx-tpm",
-                                    rate, 200, 32, clocksource_mmio_readl_up);
+                                    rate, rating, counter_width,
+                                    clocksource_mmio_readl_up);
 }
 
 static int tpm_set_next_event(unsigned long delta,
@@ -139,7 +148,6 @@ static struct clock_event_device clockevent_tpm = {
        .set_state_oneshot      = tpm_set_state_oneshot,
        .set_next_event         = tpm_set_next_event,
        .set_state_shutdown     = tpm_set_state_shutdown,
-       .rating                 = 200,
 };
 
 static int __init tpm_clockevent_init(unsigned long rate, int irq)
@@ -149,10 +157,11 @@ static int __init tpm_clockevent_init(unsigned long rate, int irq)
        ret = request_irq(irq, tpm_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL,
                          "i.MX7ULP TPM Timer", &clockevent_tpm);
 
+       clockevent_tpm.rating = rating;
        clockevent_tpm.cpumask = cpumask_of(0);
        clockevent_tpm.irq = irq;
-       clockevents_config_and_register(&clockevent_tpm,
-                                       rate, 300, 0xfffffffe);
+       clockevents_config_and_register(&clockevent_tpm, rate, 300,
+                                       GENMASK(counter_width - 1, 1));
 
        return ret;
 }
@@ -179,7 +188,7 @@ static int __init tpm_timer_init(struct device_node *np)
        ipg = of_clk_get_by_name(np, "ipg");
        per = of_clk_get_by_name(np, "per");
        if (IS_ERR(ipg) || IS_ERR(per)) {
-               pr_err("tpm: failed to get igp or per clk\n");
+               pr_err("tpm: failed to get ipg or per clk\n");
                ret = -ENODEV;
                goto err_clk_get;
        }
@@ -197,6 +206,11 @@ static int __init tpm_timer_init(struct device_node *np)
                goto err_per_clk_enable;
        }
 
+       counter_width = (readl(timer_base + TPM_PARAM) & TPM_PARAM_WIDTH_MASK)
+               >> TPM_PARAM_WIDTH_SHIFT;
+       /* use rating 200 for 32-bit counter and 150 for 16-bit counter */
+       rating = counter_width == 0x20 ? 200 : 150;
+
        /*
         * Initialize tpm module to a known state
         * 1) Counter disabled
@@ -205,16 +219,25 @@ static int __init tpm_timer_init(struct device_node *np)
         * 4) Channel0 disabled
         * 5) DMA transfers disabled
         */
+       /* make sure counter is disabled */
        writel(0, timer_base + TPM_SC);
+       /* TOF is W1C */
+       writel(TPM_SC_TOF_MASK, timer_base + TPM_SC);
        writel(0, timer_base + TPM_CNT);
-       writel(0, timer_base + TPM_C0SC);
+       /* CHF is W1C */
+       writel(TPM_C0SC_CHF_MASK, timer_base + TPM_C0SC);
 
-       /* increase per cnt, div 8 by default */
-       writel(TPM_SC_CMOD_INC_PER_CNT | TPM_SC_CMOD_DIV_DEFAULT,
+       /*
+        * increase per cnt,
+        * div 8 for 32-bit counter and div 128 for 16-bit counter
+        */
+       writel(TPM_SC_CMOD_INC_PER_CNT |
+               (counter_width == 0x20 ?
+               TPM_SC_CMOD_DIV_DEFAULT : TPM_SC_CMOD_DIV_MAX),
                     timer_base + TPM_SC);
 
        /* set MOD register to maximum for free running mode */
-       writel(0xffffffff, timer_base + TPM_MOD);
+       writel(GENMASK(counter_width - 1, 0), timer_base + TPM_MOD);
 
        rate = clk_get_rate(per) >> 3;
        ret = tpm_clocksource_init(rate);
diff --git a/drivers/clocksource/timer-npcm7xx.c b/drivers/clocksource/timer-npcm7xx.c
new file mode 100644 (file)
index 0000000..7a9bb55
--- /dev/null
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2014-2018 Nuvoton Technologies tomer.maimon@nuvoton.com
+ * All rights reserved.
+ *
+ * Copyright 2017 Google, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/err.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/clockchips.h>
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
+#include "timer-of.h"
+
+/* Timers registers */
+#define NPCM7XX_REG_TCSR0      0x0 /* Timer 0 Control and Status Register */
+#define NPCM7XX_REG_TICR0      0x8 /* Timer 0 Initial Count Register */
+#define NPCM7XX_REG_TCSR1      0x4 /* Timer 1 Control and Status Register */
+#define NPCM7XX_REG_TICR1      0xc /* Timer 1 Initial Count Register */
+#define NPCM7XX_REG_TDR1       0x14 /* Timer 1 Data Register */
+#define NPCM7XX_REG_TISR       0x18 /* Timer Interrupt Status Register */
+
+/* Timers control */
+#define NPCM7XX_Tx_RESETINT            0x1f
+#define NPCM7XX_Tx_PERIOD              BIT(27)
+#define NPCM7XX_Tx_INTEN               BIT(29)
+#define NPCM7XX_Tx_COUNTEN             BIT(30)
+#define NPCM7XX_Tx_ONESHOT             0x0
+#define NPCM7XX_Tx_OPER                        GENMASK(3, 27)
+#define NPCM7XX_Tx_MIN_PRESCALE                0x1
+#define NPCM7XX_Tx_TDR_MASK_BITS       24
+#define NPCM7XX_Tx_MAX_CNT             0xFFFFFF
+#define NPCM7XX_T0_CLR_INT             0x1
+#define NPCM7XX_Tx_CLR_CSR             0x0
+
+/* Timers operating mode */
+#define NPCM7XX_START_PERIODIC_Tx (NPCM7XX_Tx_PERIOD | NPCM7XX_Tx_COUNTEN | \
+                                       NPCM7XX_Tx_INTEN | \
+                                       NPCM7XX_Tx_MIN_PRESCALE)
+
+#define NPCM7XX_START_ONESHOT_Tx (NPCM7XX_Tx_ONESHOT | NPCM7XX_Tx_COUNTEN | \
+                                       NPCM7XX_Tx_INTEN | \
+                                       NPCM7XX_Tx_MIN_PRESCALE)
+
+#define NPCM7XX_START_Tx (NPCM7XX_Tx_COUNTEN | NPCM7XX_Tx_PERIOD | \
+                               NPCM7XX_Tx_MIN_PRESCALE)
+
+#define NPCM7XX_DEFAULT_CSR (NPCM7XX_Tx_CLR_CSR | NPCM7XX_Tx_MIN_PRESCALE)
+
+static int npcm7xx_timer_resume(struct clock_event_device *evt)
+{
+       struct timer_of *to = to_timer_of(evt);
+       u32 val;
+
+       val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+       val |= NPCM7XX_Tx_COUNTEN;
+       writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+       return 0;
+}
+
+static int npcm7xx_timer_shutdown(struct clock_event_device *evt)
+{
+       struct timer_of *to = to_timer_of(evt);
+       u32 val;
+
+       val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+       val &= ~NPCM7XX_Tx_COUNTEN;
+       writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+       return 0;
+}
+
+static int npcm7xx_timer_oneshot(struct clock_event_device *evt)
+{
+       struct timer_of *to = to_timer_of(evt);
+       u32 val;
+
+       val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+       val &= ~NPCM7XX_Tx_OPER;
+
+       val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+       val |= NPCM7XX_START_ONESHOT_Tx;
+       writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+       return 0;
+}
+
+static int npcm7xx_timer_periodic(struct clock_event_device *evt)
+{
+       struct timer_of *to = to_timer_of(evt);
+       u32 val;
+
+       val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+       val &= ~NPCM7XX_Tx_OPER;
+
+       writel(timer_of_period(to), timer_of_base(to) + NPCM7XX_REG_TICR0);
+       val |= NPCM7XX_START_PERIODIC_Tx;
+
+       writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+       return 0;
+}
+
+static int npcm7xx_clockevent_set_next_event(unsigned long evt,
+               struct clock_event_device *clk)
+{
+       struct timer_of *to = to_timer_of(clk);
+       u32 val;
+
+       writel(evt, timer_of_base(to) + NPCM7XX_REG_TICR0);
+       val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+       val |= NPCM7XX_START_Tx;
+       writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+       return 0;
+}
+
+static irqreturn_t npcm7xx_timer0_interrupt(int irq, void *dev_id)
+{
+       struct clock_event_device *evt = (struct clock_event_device *)dev_id;
+       struct timer_of *to = to_timer_of(evt);
+
+       writel(NPCM7XX_T0_CLR_INT, timer_of_base(to) + NPCM7XX_REG_TISR);
+
+       evt->event_handler(evt);
+
+       return IRQ_HANDLED;
+}
+
+static struct timer_of npcm7xx_to = {
+       .flags = TIMER_OF_IRQ | TIMER_OF_BASE | TIMER_OF_CLOCK,
+
+       .clkevt = {
+               .name               = "npcm7xx-timer0",
+               .features           = CLOCK_EVT_FEAT_PERIODIC |
+                                     CLOCK_EVT_FEAT_ONESHOT,
+               .set_next_event     = npcm7xx_clockevent_set_next_event,
+               .set_state_shutdown = npcm7xx_timer_shutdown,
+               .set_state_periodic = npcm7xx_timer_periodic,
+               .set_state_oneshot  = npcm7xx_timer_oneshot,
+               .tick_resume        = npcm7xx_timer_resume,
+               .rating             = 300,
+       },
+
+       .of_irq = {
+               .handler = npcm7xx_timer0_interrupt,
+               .flags = IRQF_TIMER | IRQF_IRQPOLL,
+       },
+};
+
+static void __init npcm7xx_clockevents_init(void)
+{
+       writel(NPCM7XX_DEFAULT_CSR,
+               timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR0);
+
+       writel(NPCM7XX_Tx_RESETINT,
+               timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TISR);
+
+       npcm7xx_to.clkevt.cpumask = cpumask_of(0);
+       clockevents_config_and_register(&npcm7xx_to.clkevt,
+                                       timer_of_rate(&npcm7xx_to),
+                                       0x1, NPCM7XX_Tx_MAX_CNT);
+}
+
+static void __init npcm7xx_clocksource_init(void)
+{
+       u32 val;
+
+       writel(NPCM7XX_DEFAULT_CSR,
+               timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR1);
+       writel(NPCM7XX_Tx_MAX_CNT,
+               timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TICR1);
+
+       val = readl(timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR1);
+       val |= NPCM7XX_START_Tx;
+       writel(val, timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR1);
+
+       clocksource_mmio_init(timer_of_base(&npcm7xx_to) +
+                               NPCM7XX_REG_TDR1,
+                               "npcm7xx-timer1", timer_of_rate(&npcm7xx_to),
+                               200, (unsigned int)NPCM7XX_Tx_TDR_MASK_BITS,
+                               clocksource_mmio_readl_down);
+}
+
+static int __init npcm7xx_timer_init(struct device_node *np)
+{
+       int ret;
+
+       ret = timer_of_init(np, &npcm7xx_to);
+       if (ret)
+               return ret;
+
+       /* Clock input is divided by PRESCALE + 1 before it is fed */
+       /* to the counter */
+       npcm7xx_to.of_clk.rate = npcm7xx_to.of_clk.rate /
+               (NPCM7XX_Tx_MIN_PRESCALE + 1);
+
+       npcm7xx_clocksource_init();
+       npcm7xx_clockevents_init();
+
+       pr_info("Enabling NPCM7xx clocksource timer base: %px, IRQ: %d ",
+               timer_of_base(&npcm7xx_to), timer_of_irq(&npcm7xx_to));
+
+       return 0;
+}
+
+TIMER_OF_DECLARE(npcm7xx, "nuvoton,npcm750-timer", npcm7xx_timer_init);
+
index 8bf6025..ae05692 100644 (file)
@@ -669,13 +669,15 @@ void ceph_fill_file_time(struct inode *inode, int issued,
                      CEPH_CAP_FILE_BUFFER|
                      CEPH_CAP_AUTH_EXCL|
                      CEPH_CAP_XATTR_EXCL)) {
-               if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+               if (ci->i_version == 0 ||
+                   timespec_compare(ctime, &inode->i_ctime) > 0) {
                        dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
                             inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
                             ctime->tv_sec, ctime->tv_nsec);
                        inode->i_ctime = *ctime;
                }
-               if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+               if (ci->i_version == 0 ||
+                   ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
                        /* the MDS did a utimes() */
                        dout("mtime %ld.%09ld -> %ld.%09ld "
                             "tw %d -> %d\n",
@@ -795,7 +797,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
        new_issued = ~issued & le32_to_cpu(info->cap.caps);
 
        /* update inode */
-       ci->i_version = le64_to_cpu(info->version);
        inode->i_rdev = le32_to_cpu(info->rdev);
        inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
 
@@ -868,6 +869,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                xattr_blob = NULL;
        }
 
+       /* finally update i_version */
+       ci->i_version = le64_to_cpu(info->version);
+
        inode->i_mapping->a_ops = &ceph_aops;
 
        switch (inode->i_mode & S_IFMT) {
index dc44de9..2ddcc96 100644 (file)
@@ -4,17 +4,18 @@ top_srcdir = ../../../../
 UNAME_M := $(shell uname -m)
 
 LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
-LIBKVM_x86_64 = lib/x86.c
+LIBKVM_x86_64 = lib/x86.c lib/vmx.c
 
 TEST_GEN_PROGS_x86_64 = set_sregs_test
 TEST_GEN_PROGS_x86_64 += sync_regs_test
+TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
 
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
-CFLAGS += -O2 -g -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
+CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
 
 # After inclusion, $(OUTPUT) is defined and
 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
index 57974ad..637b701 100644 (file)
@@ -112,24 +112,27 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
        vm_paddr_t paddr_min, uint32_t memslot);
 
-void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid);
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
 void vcpu_set_cpuid(
        struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid);
 
-struct kvm_cpuid2 *allocate_kvm_cpuid2(void);
 struct kvm_cpuid_entry2 *
-find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
-                      uint32_t index);
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
 
 static inline struct kvm_cpuid_entry2 *
-find_cpuid_entry(struct kvm_cpuid2 *cpuid, uint32_t function)
+kvm_get_supported_cpuid_entry(uint32_t function)
 {
-       return find_cpuid_index_entry(cpuid, function, 0);
+       return kvm_get_supported_cpuid_index(function, 0);
 }
 
 struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code);
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
 
+typedef void (*vmx_guest_code_t)(vm_vaddr_t vmxon_vaddr,
+                                vm_paddr_t vmxon_paddr,
+                                vm_vaddr_t vmcs_vaddr,
+                                vm_paddr_t vmcs_paddr);
+
 struct kvm_userspace_memory_region *
 kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
                                 uint64_t end);
diff --git a/tools/testing/selftests/kvm/include/vmx.h b/tools/testing/selftests/kvm/include/vmx.h
new file mode 100644 (file)
index 0000000..6ed8499
--- /dev/null
@@ -0,0 +1,494 @@
+/*
+ * tools/testing/selftests/kvm/include/vmx.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+
+#ifndef SELFTEST_KVM_VMX_H
+#define SELFTEST_KVM_VMX_H
+
+#include <stdint.h>
+#include "x86.h"
+
+#define CPUID_VMX_BIT                          5
+
+#define CPUID_VMX                              (1 << 5)
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING         0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING            0x00000008
+#define CPU_BASED_HLT_EXITING                  0x00000080
+#define CPU_BASED_INVLPG_EXITING               0x00000200
+#define CPU_BASED_MWAIT_EXITING                        0x00000400
+#define CPU_BASED_RDPMC_EXITING                        0x00000800
+#define CPU_BASED_RDTSC_EXITING                        0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING             0x00008000
+#define CPU_BASED_CR3_STORE_EXITING            0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING             0x00080000
+#define CPU_BASED_CR8_STORE_EXITING            0x00100000
+#define CPU_BASED_TPR_SHADOW                   0x00200000
+#define CPU_BASED_VIRTUAL_NMI_PENDING          0x00400000
+#define CPU_BASED_MOV_DR_EXITING               0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING            0x01000000
+#define CPU_BASED_USE_IO_BITMAPS               0x02000000
+#define CPU_BASED_MONITOR_TRAP                 0x08000000
+#define CPU_BASED_USE_MSR_BITMAPS              0x10000000
+#define CPU_BASED_MONITOR_EXITING              0x20000000
+#define CPU_BASED_PAUSE_EXITING                        0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS  0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR    0x0401e172
+
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT              0x00000002
+#define SECONDARY_EXEC_DESC                    0x00000004
+#define SECONDARY_EXEC_RDTSCP                  0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE  0x00000010
+#define SECONDARY_EXEC_ENABLE_VPID             0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST      0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT      0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY   0x00000200
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING      0x00000400
+#define SECONDARY_EXEC_RDRAND_EXITING          0x00000800
+#define SECONDARY_EXEC_ENABLE_INVPCID          0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC           0x00002000
+#define SECONDARY_EXEC_SHADOW_VMCS             0x00004000
+#define SECONDARY_EXEC_RDSEED_EXITING          0x00010000
+#define SECONDARY_EXEC_ENABLE_PML              0x00020000
+#define SECONDARY_EPT_VE                       0x00040000
+#define SECONDARY_ENABLE_XSAV_RESTORE          0x00100000
+#define SECONDARY_EXEC_TSC_SCALING             0x02000000
+
+#define PIN_BASED_EXT_INTR_MASK                        0x00000001
+#define PIN_BASED_NMI_EXITING                  0x00000008
+#define PIN_BASED_VIRTUAL_NMIS                 0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER         0x00000040
+#define PIN_BASED_POSTED_INTR                  0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR    0x00000016
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS            0x00000004
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE           0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL     0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT               0x00008000
+#define VM_EXIT_SAVE_IA32_PAT                  0x00040000
+#define VM_EXIT_LOAD_IA32_PAT                  0x00080000
+#define VM_EXIT_SAVE_IA32_EFER                 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER                 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER      0x00400000
+
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR      0x00036dff
+
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS           0x00000004
+#define VM_ENTRY_IA32E_MODE                    0x00000200
+#define VM_ENTRY_SMM                           0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR            0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL    0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER                        0x00008000
+
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR     0x000011ff
+
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK    0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA                 0x00000020
+
+#define EXIT_REASON_FAILED_VMENTRY     0x80000000
+#define EXIT_REASON_EXCEPTION_NMI      0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT       2
+#define EXIT_REASON_PENDING_INTERRUPT  7
+#define EXIT_REASON_NMI_WINDOW         8
+#define EXIT_REASON_TASK_SWITCH                9
+#define EXIT_REASON_CPUID              10
+#define EXIT_REASON_HLT                        12
+#define EXIT_REASON_INVD               13
+#define EXIT_REASON_INVLPG             14
+#define EXIT_REASON_RDPMC              15
+#define EXIT_REASON_RDTSC              16
+#define EXIT_REASON_VMCALL             18
+#define EXIT_REASON_VMCLEAR            19
+#define EXIT_REASON_VMLAUNCH           20
+#define EXIT_REASON_VMPTRLD            21
+#define EXIT_REASON_VMPTRST            22
+#define EXIT_REASON_VMREAD             23
+#define EXIT_REASON_VMRESUME           24
+#define EXIT_REASON_VMWRITE            25
+#define EXIT_REASON_VMOFF              26
+#define EXIT_REASON_VMON               27
+#define EXIT_REASON_CR_ACCESS          28
+#define EXIT_REASON_DR_ACCESS          29
+#define EXIT_REASON_IO_INSTRUCTION     30
+#define EXIT_REASON_MSR_READ           31
+#define EXIT_REASON_MSR_WRITE          32
+#define EXIT_REASON_INVALID_STATE      33
+#define EXIT_REASON_MWAIT_INSTRUCTION  36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION  40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS                44
+#define EXIT_REASON_EOI_INDUCED                45
+#define EXIT_REASON_EPT_VIOLATION      48
+#define EXIT_REASON_EPT_MISCONFIG      49
+#define EXIT_REASON_INVEPT             50
+#define EXIT_REASON_RDTSCP             51
+#define EXIT_REASON_PREEMPTION_TIMER   52
+#define EXIT_REASON_INVVPID            53
+#define EXIT_REASON_WBINVD             54
+#define EXIT_REASON_XSETBV             55
+#define EXIT_REASON_APIC_WRITE         56
+#define EXIT_REASON_INVPCID            58
+#define EXIT_REASON_PML_FULL           62
+#define EXIT_REASON_XSAVES             63
+#define EXIT_REASON_XRSTORS            64
+#define LAST_EXIT_REASON               64
+
+enum vmcs_field {
+       VIRTUAL_PROCESSOR_ID            = 0x00000000,
+       POSTED_INTR_NV                  = 0x00000002,
+       GUEST_ES_SELECTOR               = 0x00000800,
+       GUEST_CS_SELECTOR               = 0x00000802,
+       GUEST_SS_SELECTOR               = 0x00000804,
+       GUEST_DS_SELECTOR               = 0x00000806,
+       GUEST_FS_SELECTOR               = 0x00000808,
+       GUEST_GS_SELECTOR               = 0x0000080a,
+       GUEST_LDTR_SELECTOR             = 0x0000080c,
+       GUEST_TR_SELECTOR               = 0x0000080e,
+       GUEST_INTR_STATUS               = 0x00000810,
+       GUEST_PML_INDEX                 = 0x00000812,
+       HOST_ES_SELECTOR                = 0x00000c00,
+       HOST_CS_SELECTOR                = 0x00000c02,
+       HOST_SS_SELECTOR                = 0x00000c04,
+       HOST_DS_SELECTOR                = 0x00000c06,
+       HOST_FS_SELECTOR                = 0x00000c08,
+       HOST_GS_SELECTOR                = 0x00000c0a,
+       HOST_TR_SELECTOR                = 0x00000c0c,
+       IO_BITMAP_A                     = 0x00002000,
+       IO_BITMAP_A_HIGH                = 0x00002001,
+       IO_BITMAP_B                     = 0x00002002,
+       IO_BITMAP_B_HIGH                = 0x00002003,
+       MSR_BITMAP                      = 0x00002004,
+       MSR_BITMAP_HIGH                 = 0x00002005,
+       VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
+       VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
+       VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
+       VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
+       VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
+       VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+       PML_ADDRESS                     = 0x0000200e,
+       PML_ADDRESS_HIGH                = 0x0000200f,
+       TSC_OFFSET                      = 0x00002010,
+       TSC_OFFSET_HIGH                 = 0x00002011,
+       VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
+       VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+       APIC_ACCESS_ADDR                = 0x00002014,
+       APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+       POSTED_INTR_DESC_ADDR           = 0x00002016,
+       POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
+       EPT_POINTER                     = 0x0000201a,
+       EPT_POINTER_HIGH                = 0x0000201b,
+       EOI_EXIT_BITMAP0                = 0x0000201c,
+       EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+       EOI_EXIT_BITMAP1                = 0x0000201e,
+       EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+       EOI_EXIT_BITMAP2                = 0x00002020,
+       EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+       EOI_EXIT_BITMAP3                = 0x00002022,
+       EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+       VMREAD_BITMAP                   = 0x00002026,
+       VMREAD_BITMAP_HIGH              = 0x00002027,
+       VMWRITE_BITMAP                  = 0x00002028,
+       VMWRITE_BITMAP_HIGH             = 0x00002029,
+       XSS_EXIT_BITMAP                 = 0x0000202C,
+       XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
+       TSC_MULTIPLIER                  = 0x00002032,
+       TSC_MULTIPLIER_HIGH             = 0x00002033,
+       GUEST_PHYSICAL_ADDRESS          = 0x00002400,
+       GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
+       VMCS_LINK_POINTER               = 0x00002800,
+       VMCS_LINK_POINTER_HIGH          = 0x00002801,
+       GUEST_IA32_DEBUGCTL             = 0x00002802,
+       GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+       GUEST_IA32_PAT                  = 0x00002804,
+       GUEST_IA32_PAT_HIGH             = 0x00002805,
+       GUEST_IA32_EFER                 = 0x00002806,
+       GUEST_IA32_EFER_HIGH            = 0x00002807,
+       GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+       GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+       GUEST_PDPTR0                    = 0x0000280a,
+       GUEST_PDPTR0_HIGH               = 0x0000280b,
+       GUEST_PDPTR1                    = 0x0000280c,
+       GUEST_PDPTR1_HIGH               = 0x0000280d,
+       GUEST_PDPTR2                    = 0x0000280e,
+       GUEST_PDPTR2_HIGH               = 0x0000280f,
+       GUEST_PDPTR3                    = 0x00002810,
+       GUEST_PDPTR3_HIGH               = 0x00002811,
+       GUEST_BNDCFGS                   = 0x00002812,
+       GUEST_BNDCFGS_HIGH              = 0x00002813,
+       HOST_IA32_PAT                   = 0x00002c00,
+       HOST_IA32_PAT_HIGH              = 0x00002c01,
+       HOST_IA32_EFER                  = 0x00002c02,
+       HOST_IA32_EFER_HIGH             = 0x00002c03,
+       HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+       HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+       PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
+       CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
+       EXCEPTION_BITMAP                = 0x00004004,
+       PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
+       PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
+       CR3_TARGET_COUNT                = 0x0000400a,
+       VM_EXIT_CONTROLS                = 0x0000400c,
+       VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
+       VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
+       VM_ENTRY_CONTROLS               = 0x00004012,
+       VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
+       VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
+       VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
+       VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
+       TPR_THRESHOLD                   = 0x0000401c,
+       SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
+       PLE_GAP                         = 0x00004020,
+       PLE_WINDOW                      = 0x00004022,
+       VM_INSTRUCTION_ERROR            = 0x00004400,
+       VM_EXIT_REASON                  = 0x00004402,
+       VM_EXIT_INTR_INFO               = 0x00004404,
+       VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
+       IDT_VECTORING_INFO_FIELD        = 0x00004408,
+       IDT_VECTORING_ERROR_CODE        = 0x0000440a,
+       VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
+       VMX_INSTRUCTION_INFO            = 0x0000440e,
+       GUEST_ES_LIMIT                  = 0x00004800,
+       GUEST_CS_LIMIT                  = 0x00004802,
+       GUEST_SS_LIMIT                  = 0x00004804,
+       GUEST_DS_LIMIT                  = 0x00004806,
+       GUEST_FS_LIMIT                  = 0x00004808,
+       GUEST_GS_LIMIT                  = 0x0000480a,
+       GUEST_LDTR_LIMIT                = 0x0000480c,
+       GUEST_TR_LIMIT                  = 0x0000480e,
+       GUEST_GDTR_LIMIT                = 0x00004810,
+       GUEST_IDTR_LIMIT                = 0x00004812,
+       GUEST_ES_AR_BYTES               = 0x00004814,
+       GUEST_CS_AR_BYTES               = 0x00004816,
+       GUEST_SS_AR_BYTES               = 0x00004818,
+       GUEST_DS_AR_BYTES               = 0x0000481a,
+       GUEST_FS_AR_BYTES               = 0x0000481c,
+       GUEST_GS_AR_BYTES               = 0x0000481e,
+       GUEST_LDTR_AR_BYTES             = 0x00004820,
+       GUEST_TR_AR_BYTES               = 0x00004822,
+       GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
+       GUEST_ACTIVITY_STATE            = 0X00004826,
+       GUEST_SYSENTER_CS               = 0x0000482A,
+       VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
+       HOST_IA32_SYSENTER_CS           = 0x00004c00,
+       CR0_GUEST_HOST_MASK             = 0x00006000,
+       CR4_GUEST_HOST_MASK             = 0x00006002,
+       CR0_READ_SHADOW                 = 0x00006004,
+       CR4_READ_SHADOW                 = 0x00006006,
+       CR3_TARGET_VALUE0               = 0x00006008,
+       CR3_TARGET_VALUE1               = 0x0000600a,
+       CR3_TARGET_VALUE2               = 0x0000600c,
+       CR3_TARGET_VALUE3               = 0x0000600e,
+       EXIT_QUALIFICATION              = 0x00006400,
+       GUEST_LINEAR_ADDRESS            = 0x0000640a,
+       GUEST_CR0                       = 0x00006800,
+       GUEST_CR3                       = 0x00006802,
+       GUEST_CR4                       = 0x00006804,
+       GUEST_ES_BASE                   = 0x00006806,
+       GUEST_CS_BASE                   = 0x00006808,
+       GUEST_SS_BASE                   = 0x0000680a,
+       GUEST_DS_BASE                   = 0x0000680c,
+       GUEST_FS_BASE                   = 0x0000680e,
+       GUEST_GS_BASE                   = 0x00006810,
+       GUEST_LDTR_BASE                 = 0x00006812,
+       GUEST_TR_BASE                   = 0x00006814,
+       GUEST_GDTR_BASE                 = 0x00006816,
+       GUEST_IDTR_BASE                 = 0x00006818,
+       GUEST_DR7                       = 0x0000681a,
+       GUEST_RSP                       = 0x0000681c,
+       GUEST_RIP                       = 0x0000681e,
+       GUEST_RFLAGS                    = 0x00006820,
+       GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
+       GUEST_SYSENTER_ESP              = 0x00006824,
+       GUEST_SYSENTER_EIP              = 0x00006826,
+       HOST_CR0                        = 0x00006c00,
+       HOST_CR3                        = 0x00006c02,
+       HOST_CR4                        = 0x00006c04,
+       HOST_FS_BASE                    = 0x00006c06,
+       HOST_GS_BASE                    = 0x00006c08,
+       HOST_TR_BASE                    = 0x00006c0a,
+       HOST_GDTR_BASE                  = 0x00006c0c,
+       HOST_IDTR_BASE                  = 0x00006c0e,
+       HOST_IA32_SYSENTER_ESP          = 0x00006c10,
+       HOST_IA32_SYSENTER_EIP          = 0x00006c12,
+       HOST_RSP                        = 0x00006c14,
+       HOST_RIP                        = 0x00006c16,
+};
+
+struct vmx_msr_entry {
+       uint32_t index;
+       uint32_t reserved;
+       uint64_t value;
+} __attribute__ ((aligned(16)));
+
+static inline int vmxon(uint64_t phys)
+{
+       uint8_t ret;
+
+       __asm__ __volatile__ ("vmxon %[pa]; setna %[ret]"
+               : [ret]"=rm"(ret)
+               : [pa]"m"(phys)
+               : "cc", "memory");
+
+       return ret;
+}
+
+static inline void vmxoff(void)
+{
+       __asm__ __volatile__("vmxoff");
+}
+
+static inline int vmclear(uint64_t vmcs_pa)
+{
+       uint8_t ret;
+
+       __asm__ __volatile__ ("vmclear %[pa]; setna %[ret]"
+               : [ret]"=rm"(ret)
+               : [pa]"m"(vmcs_pa)
+               : "cc", "memory");
+
+       return ret;
+}
+
+static inline int vmptrld(uint64_t vmcs_pa)
+{
+       uint8_t ret;
+
+       __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]"
+               : [ret]"=rm"(ret)
+               : [pa]"m"(vmcs_pa)
+               : "cc", "memory");
+
+       return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmlaunch.
+ */
+static inline int vmlaunch(void)
+{
+       int ret;
+
+       __asm__ __volatile__("push %%rbp;"
+                            "push %%rcx;"
+                            "push %%rdx;"
+                            "push %%rsi;"
+                            "push %%rdi;"
+                            "push $0;"
+                            "vmwrite %%rsp, %[host_rsp];"
+                            "lea 1f(%%rip), %%rax;"
+                            "vmwrite %%rax, %[host_rip];"
+                            "vmlaunch;"
+                            "incq (%%rsp);"
+                            "1: pop %%rax;"
+                            "pop %%rdi;"
+                            "pop %%rsi;"
+                            "pop %%rdx;"
+                            "pop %%rcx;"
+                            "pop %%rbp;"
+                            : [ret]"=&a"(ret)
+                            : [host_rsp]"r"((uint64_t)HOST_RSP),
+                              [host_rip]"r"((uint64_t)HOST_RIP)
+                            : "memory", "cc", "rbx", "r8", "r9", "r10",
+                              "r11", "r12", "r13", "r14", "r15");
+       return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ */
+static inline int vmresume(void)
+{
+       int ret;
+
+       __asm__ __volatile__("push %%rbp;"
+                            "push %%rcx;"
+                            "push %%rdx;"
+                            "push %%rsi;"
+                            "push %%rdi;"
+                            "push $0;"
+                            "vmwrite %%rsp, %[host_rsp];"
+                            "lea 1f(%%rip), %%rax;"
+                            "vmwrite %%rax, %[host_rip];"
+                            "vmresume;"
+                            "incq (%%rsp);"
+                            "1: pop %%rax;"
+                            "pop %%rdi;"
+                            "pop %%rsi;"
+                            "pop %%rdx;"
+                            "pop %%rcx;"
+                            "pop %%rbp;"
+                            : [ret]"=&a"(ret)
+                            : [host_rsp]"r"((uint64_t)HOST_RSP),
+                              [host_rip]"r"((uint64_t)HOST_RIP)
+                            : "memory", "cc", "rbx", "r8", "r9", "r10",
+                              "r11", "r12", "r13", "r14", "r15");
+       return ret;
+}
+
+static inline int vmread(uint64_t encoding, uint64_t *value)
+{
+       uint64_t tmp;
+       uint8_t ret;
+
+       __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]"
+               : [value]"=rm"(tmp), [ret]"=rm"(ret)
+               : [encoding]"r"(encoding)
+               : "cc", "memory");
+
+       *value = tmp;
+       return ret;
+}
+
+/*
+ * A wrapper around vmread that ignores errors and returns zero if the
+ * vmread instruction fails.
+ */
+static inline uint64_t vmreadz(uint64_t encoding)
+{
+       uint64_t value = 0;
+       vmread(encoding, &value);
+       return value;
+}
+
+static inline int vmwrite(uint64_t encoding, uint64_t value)
+{
+       uint8_t ret;
+
+       __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]"
+               : [ret]"=rm"(ret)
+               : [value]"rm"(value), [encoding]"r"(encoding)
+               : "cc", "memory");
+
+       return ret;
+}
+
+static inline uint32_t vmcs_revision(void)
+{
+       return rdmsr(MSR_IA32_VMX_BASIC);
+}
+
+void prepare_for_vmx_operation(void);
+void prepare_vmcs(void *guest_rip, void *guest_rsp);
+struct kvm_vm *vm_create_default_vmx(uint32_t vcpuid,
+                                    vmx_guest_code_t guest_code);
+
+#endif /* !SELFTEST_KVM_VMX_H */
index 7ca1bb4..2cedfda 100644 (file)
@@ -378,7 +378,7 @@ int kvm_memcmp_hva_gva(void *hva,
  * complicated. This function uses a reasonable default length for
  * the array and performs the appropriate allocation.
  */
-struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
+static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
 {
        struct kvm_cpuid2 *cpuid;
        int nent = 100;
@@ -402,17 +402,21 @@ struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
  * Input Args: None
  *
  * Output Args:
- *   cpuid - The supported KVM CPUID
  *
- * Return: void
+ * Return: The supported KVM CPUID
  *
  * Get the guest CPUID supported by KVM.
  */
-void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
 {
+       static struct kvm_cpuid2 *cpuid;
        int ret;
        int kvm_fd;
 
+       if (cpuid)
+               return cpuid;
+
+       cpuid = allocate_kvm_cpuid2();
        kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
        TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
                KVM_DEV_PATH, kvm_fd, errno);
@@ -422,6 +426,7 @@ void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
                    ret, errno);
 
        close(kvm_fd);
+       return cpuid;
 }
 
 /* Locate a cpuid entry.
@@ -435,12 +440,13 @@ void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
  * Return: A pointer to the cpuid entry. Never returns NULL.
  */
 struct kvm_cpuid_entry2 *
-find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
-                      uint32_t index)
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
 {
+       struct kvm_cpuid2 *cpuid;
        struct kvm_cpuid_entry2 *entry = NULL;
        int i;
 
+       cpuid = kvm_get_supported_cpuid();
        for (i = 0; i < cpuid->nent; i++) {
                if (cpuid->entries[i].function == function &&
                    cpuid->entries[i].index == index) {
@@ -1435,7 +1441,7 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
        sparsebit_idx_t pg;
 
        TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
-               "not divisable by page size.\n"
+               "not divisible by page size.\n"
                "  paddr_min: 0x%lx page_size: 0x%x",
                paddr_min, vm->page_size);
 
index 0c5cf3e..b132bc9 100644 (file)
  *     avoided by moving the setting of the nodes mask bits into
  *     the previous nodes num_after setting.
  *
- *   + Node starting index is evenly divisable by the number of bits
+ *   + Node starting index is evenly divisible by the number of bits
  *     within a nodes mask member.
  *
  *   + Nodes never represent a range of bits that wrap around the
@@ -1741,7 +1741,7 @@ void sparsebit_validate_internal(struct sparsebit *s)
 
                /* Validate node index is divisible by the mask size */
                if (nodep->idx % MASK_BITS) {
-                       fprintf(stderr, "Node index not divisable by "
+                       fprintf(stderr, "Node index not divisible by "
                                "mask size,\n"
                                "  nodep: %p nodep->idx: 0x%lx "
                                "MASK_BITS: %lu\n",
diff --git a/tools/testing/selftests/kvm/lib/vmx.c b/tools/testing/selftests/kvm/lib/vmx.c
new file mode 100644 (file)
index 0000000..0231bc0
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * tools/testing/selftests/kvm/lib/x86.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "x86.h"
+#include "vmx.h"
+
+/* Create a default VM for VMX tests.
+ *
+ * Input Args:
+ *   vcpuid - The id of the single VCPU to add to the VM.
+ *   guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *
+vm_create_default_vmx(uint32_t vcpuid, vmx_guest_code_t guest_code)
+{
+       struct kvm_cpuid2 *cpuid;
+       struct kvm_vm *vm;
+       vm_vaddr_t vmxon_vaddr;
+       vm_paddr_t vmxon_paddr;
+       vm_vaddr_t vmcs_vaddr;
+       vm_paddr_t vmcs_paddr;
+
+       vm = vm_create_default(vcpuid, (void *) guest_code);
+
+       /* Enable nesting in CPUID */
+       vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());
+
+       /* Setup of a region of guest memory for the vmxon region. */
+       vmxon_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
+       vmxon_paddr = addr_gva2gpa(vm, vmxon_vaddr);
+
+       /* Setup of a region of guest memory for a vmcs. */
+       vmcs_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
+       vmcs_paddr = addr_gva2gpa(vm, vmcs_vaddr);
+
+       vcpu_args_set(vm, vcpuid, 4, vmxon_vaddr, vmxon_paddr, vmcs_vaddr,
+                     vmcs_paddr);
+
+       return vm;
+}
+
+void prepare_for_vmx_operation(void)
+{
+       uint64_t feature_control;
+       uint64_t required;
+       unsigned long cr0;
+       unsigned long cr4;
+
+       /*
+        * Ensure bits in CR0 and CR4 are valid in VMX operation:
+        * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx.
+        * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx.
+        */
+       __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory");
+       cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1);
+       cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0);
+       __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory");
+
+       __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory");
+       cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1);
+       cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0);
+       /* Enable VMX operation */
+       cr4 |= X86_CR4_VMXE;
+       __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory");
+
+       /*
+        * Configure IA32_FEATURE_CONTROL MSR to allow VMXON:
+        *  Bit 0: Lock bit. If clear, VMXON causes a #GP.
+        *  Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON
+        *    outside of SMX causes a #GP.
+        */
+       required = FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+       required |= FEATURE_CONTROL_LOCKED;
+       feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+       if ((feature_control & required) != required)
+               wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required);
+}
+
+/*
+ * Initialize the control fields to the most basic settings possible.
+ */
+static inline void init_vmcs_control_fields(void)
+{
+       vmwrite(VIRTUAL_PROCESSOR_ID, 0);
+       vmwrite(POSTED_INTR_NV, 0);
+
+       vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PINBASED_CTLS));
+       vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PROCBASED_CTLS));
+       vmwrite(EXCEPTION_BITMAP, 0);
+       vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+       vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */
+       vmwrite(CR3_TARGET_COUNT, 0);
+       vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) |
+               VM_EXIT_HOST_ADDR_SPACE_SIZE);    /* 64-bit host */
+       vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+       vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+       vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) |
+               VM_ENTRY_IA32E_MODE);             /* 64-bit guest */
+       vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+       vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
+       vmwrite(TPR_THRESHOLD, 0);
+       vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
+
+       vmwrite(CR0_GUEST_HOST_MASK, 0);
+       vmwrite(CR4_GUEST_HOST_MASK, 0);
+       vmwrite(CR0_READ_SHADOW, get_cr0());
+       vmwrite(CR4_READ_SHADOW, get_cr4());
+}
+
+/*
+ * Initialize the host state fields based on the current host state, with
+ * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch
+ * or vmresume.
+ */
+static inline void init_vmcs_host_state(void)
+{
+       uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS);
+
+       vmwrite(HOST_ES_SELECTOR, get_es());
+       vmwrite(HOST_CS_SELECTOR, get_cs());
+       vmwrite(HOST_SS_SELECTOR, get_ss());
+       vmwrite(HOST_DS_SELECTOR, get_ds());
+       vmwrite(HOST_FS_SELECTOR, get_fs());
+       vmwrite(HOST_GS_SELECTOR, get_gs());
+       vmwrite(HOST_TR_SELECTOR, get_tr());
+
+       if (exit_controls & VM_EXIT_LOAD_IA32_PAT)
+               vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT));
+       if (exit_controls & VM_EXIT_LOAD_IA32_EFER)
+               vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER));
+       if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+               vmwrite(HOST_IA32_PERF_GLOBAL_CTRL,
+                       rdmsr(MSR_CORE_PERF_GLOBAL_CTRL));
+
+       vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS));
+
+       vmwrite(HOST_CR0, get_cr0());
+       vmwrite(HOST_CR3, get_cr3());
+       vmwrite(HOST_CR4, get_cr4());
+       vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE));
+       vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE));
+       vmwrite(HOST_TR_BASE,
+               get_desc64_base((struct desc64 *)(get_gdt_base() + get_tr())));
+       vmwrite(HOST_GDTR_BASE, get_gdt_base());
+       vmwrite(HOST_IDTR_BASE, get_idt_base());
+       vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP));
+       vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP));
+}
+
+/*
+ * Initialize the guest state fields essentially as a clone of
+ * the host state fields. Some host state fields have fixed
+ * values, and we set the corresponding guest state fields accordingly.
+ */
+static inline void init_vmcs_guest_state(void *rip, void *rsp)
+{
+       vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR));
+       vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR));
+       vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR));
+       vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR));
+       vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR));
+       vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR));
+       vmwrite(GUEST_LDTR_SELECTOR, 0);
+       vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR));
+       vmwrite(GUEST_INTR_STATUS, 0);
+       vmwrite(GUEST_PML_INDEX, 0);
+
+       vmwrite(VMCS_LINK_POINTER, -1ll);
+       vmwrite(GUEST_IA32_DEBUGCTL, 0);
+       vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT));
+       vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER));
+       vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL,
+               vmreadz(HOST_IA32_PERF_GLOBAL_CTRL));
+
+       vmwrite(GUEST_ES_LIMIT, -1);
+       vmwrite(GUEST_CS_LIMIT, -1);
+       vmwrite(GUEST_SS_LIMIT, -1);
+       vmwrite(GUEST_DS_LIMIT, -1);
+       vmwrite(GUEST_FS_LIMIT, -1);
+       vmwrite(GUEST_GS_LIMIT, -1);
+       vmwrite(GUEST_LDTR_LIMIT, -1);
+       vmwrite(GUEST_TR_LIMIT, 0x67);
+       vmwrite(GUEST_GDTR_LIMIT, 0xffff);
+       vmwrite(GUEST_IDTR_LIMIT, 0xffff);
+       vmwrite(GUEST_ES_AR_BYTES,
+               vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093);
+       vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
+       vmwrite(GUEST_SS_AR_BYTES, 0xc093);
+       vmwrite(GUEST_DS_AR_BYTES,
+               vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+       vmwrite(GUEST_FS_AR_BYTES,
+               vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+       vmwrite(GUEST_GS_AR_BYTES,
+               vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+       vmwrite(GUEST_LDTR_AR_BYTES, 0x10000);
+       vmwrite(GUEST_TR_AR_BYTES, 0x8b);
+       vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+       vmwrite(GUEST_ACTIVITY_STATE, 0);
+       vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS));
+       vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0);
+
+       vmwrite(GUEST_CR0, vmreadz(HOST_CR0));
+       vmwrite(GUEST_CR3, vmreadz(HOST_CR3));
+       vmwrite(GUEST_CR4, vmreadz(HOST_CR4));
+       vmwrite(GUEST_ES_BASE, 0);
+       vmwrite(GUEST_CS_BASE, 0);
+       vmwrite(GUEST_SS_BASE, 0);
+       vmwrite(GUEST_DS_BASE, 0);
+       vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE));
+       vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE));
+       vmwrite(GUEST_LDTR_BASE, 0);
+       vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE));
+       vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE));
+       vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE));
+       vmwrite(GUEST_DR7, 0x400);
+       vmwrite(GUEST_RSP, (uint64_t)rsp);
+       vmwrite(GUEST_RIP, (uint64_t)rip);
+       vmwrite(GUEST_RFLAGS, 2);
+       vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+       vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP));
+       vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
+}
+
+void prepare_vmcs(void *guest_rip, void *guest_rsp)
+{
+       init_vmcs_control_fields();
+       init_vmcs_host_state();
+       init_vmcs_guest_state(guest_rip, guest_rsp);
+}
diff --git a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
new file mode 100644 (file)
index 0000000..8f7f620
--- /dev/null
@@ -0,0 +1,231 @@
+/*
+ * gtests/tests/vmx_tsc_adjust_test.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ *
+ * IA32_TSC_ADJUST test
+ *
+ * According to the SDM, "if an execution of WRMSR to the
+ * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC,
+ * the logical processor also adds (or subtracts) value X from the
+ * IA32_TSC_ADJUST MSR.
+ *
+ * Note that when L1 doesn't intercept writes to IA32_TSC, a
+ * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC
+ * value.
+ *
+ * This test verifies that this unusual case is handled correctly.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "x86.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#ifndef MSR_IA32_TSC_ADJUST
+#define MSR_IA32_TSC_ADJUST 0x3b
+#endif
+
+#define PAGE_SIZE      4096
+#define VCPU_ID                5
+
+#define TSC_ADJUST_VALUE (1ll << 32)
+#define TSC_OFFSET_VALUE -(1ll << 48)
+
+enum {
+       PORT_ABORT = 0x1000,
+       PORT_REPORT,
+       PORT_DONE,
+};
+
+struct vmx_page {
+       vm_vaddr_t virt;
+       vm_paddr_t phys;
+};
+
+enum {
+       VMXON_PAGE = 0,
+       VMCS_PAGE,
+       MSR_BITMAP_PAGE,
+
+       NUM_VMX_PAGES,
+};
+
+struct kvm_single_msr {
+       struct kvm_msrs header;
+       struct kvm_msr_entry entry;
+} __attribute__((packed));
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+/* Array of vmx_page descriptors that is shared with the guest. */
+struct vmx_page *vmx_pages;
+
+#define exit_to_l0(_port, _arg) do_exit_to_l0(_port, (unsigned long) (_arg))
+static void do_exit_to_l0(uint16_t port, unsigned long arg)
+{
+       __asm__ __volatile__("in %[port], %%al"
+               :
+               : [port]"d"(port), "D"(arg)
+               : "rax");
+}
+
+
+#define GUEST_ASSERT(_condition) do {                                       \
+       if (!(_condition))                                                   \
+               exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition); \
+} while (0)
+
+static void check_ia32_tsc_adjust(int64_t max)
+{
+       int64_t adjust;
+
+       adjust = rdmsr(MSR_IA32_TSC_ADJUST);
+       exit_to_l0(PORT_REPORT, adjust);
+       GUEST_ASSERT(adjust <= max);
+}
+
+static void l2_guest_code(void)
+{
+       uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE;
+
+       wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE);
+       check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+       /* Exit to L1 */
+       __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_page *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+       unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+       uint32_t control;
+       uintptr_t save_cr3;
+
+       GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE);
+       wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE);
+       check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+
+       prepare_for_vmx_operation();
+
+       /* Enter VMX root operation. */
+       *(uint32_t *)vmx_pages[VMXON_PAGE].virt = vmcs_revision();
+       GUEST_ASSERT(!vmxon(vmx_pages[VMXON_PAGE].phys));
+
+       /* Load a VMCS. */
+       *(uint32_t *)vmx_pages[VMCS_PAGE].virt = vmcs_revision();
+       GUEST_ASSERT(!vmclear(vmx_pages[VMCS_PAGE].phys));
+       GUEST_ASSERT(!vmptrld(vmx_pages[VMCS_PAGE].phys));
+
+       /* Prepare the VMCS for L2 execution. */
+       prepare_vmcs(l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+       control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+       control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING;
+       vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+       vmwrite(MSR_BITMAP, vmx_pages[MSR_BITMAP_PAGE].phys);
+       vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE);
+
+       /* Jump into L2.  First, test failure to load guest CR3.  */
+       save_cr3 = vmreadz(GUEST_CR3);
+       vmwrite(GUEST_CR3, -1ull);
+       GUEST_ASSERT(!vmlaunch());
+       GUEST_ASSERT(vmreadz(VM_EXIT_REASON) ==
+                    (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE));
+       check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+       vmwrite(GUEST_CR3, save_cr3);
+
+       GUEST_ASSERT(!vmlaunch());
+       GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+       check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+       exit_to_l0(PORT_DONE, 0);
+}
+
+static void allocate_vmx_page(struct vmx_page *page)
+{
+       vm_vaddr_t virt;
+
+       virt = vm_vaddr_alloc(vm, PAGE_SIZE, 0, 0, 0);
+       memset(addr_gva2hva(vm, virt), 0, PAGE_SIZE);
+
+       page->virt = virt;
+       page->phys = addr_gva2gpa(vm, virt);
+}
+
+static vm_vaddr_t allocate_vmx_pages(void)
+{
+       vm_vaddr_t vmx_pages_vaddr;
+       int i;
+
+       vmx_pages_vaddr = vm_vaddr_alloc(
+               vm, sizeof(struct vmx_page) * NUM_VMX_PAGES, 0, 0, 0);
+
+       vmx_pages = (void *) addr_gva2hva(vm, vmx_pages_vaddr);
+
+       for (i = 0; i < NUM_VMX_PAGES; i++)
+               allocate_vmx_page(&vmx_pages[i]);
+
+       return vmx_pages_vaddr;
+}
+
+void report(int64_t val)
+{
+       printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
+              val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE);
+}
+
+int main(int argc, char *argv[])
+{
+       vm_vaddr_t vmx_pages_vaddr;
+       struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
+
+       if (!(entry->ecx & CPUID_VMX)) {
+               printf("nested VMX not enabled, skipping test");
+               return 0;
+       }
+
+       vm = vm_create_default_vmx(VCPU_ID, (void *) l1_guest_code);
+
+       /* Allocate VMX pages and shared descriptors (vmx_pages). */
+       vmx_pages_vaddr = allocate_vmx_pages();
+       vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_vaddr);
+
+       for (;;) {
+               volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+               struct kvm_regs regs;
+
+               vcpu_run(vm, VCPU_ID);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                           "Got exit_reason other than KVM_EXIT_IO: %u (%s),\n",
+                           run->exit_reason,
+                           exit_reason_str(run->exit_reason));
+
+               vcpu_regs_get(vm, VCPU_ID, &regs);
+
+               switch (run->io.port) {
+               case PORT_ABORT:
+                       TEST_ASSERT(false, "%s", (const char *) regs.rdi);
+                       /* NOT REACHED */
+               case PORT_REPORT:
+                       report(regs.rdi);
+                       break;
+               case PORT_DONE:
+                       goto done;
+               default:
+                       TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port);
+               }
+       }
+
+       kvm_vm_free(vm);
+done:
+       return 0;
+}