KVM: selftests: x86: Set supported CPUIDs on default VM
[linux-2.6-microblaze.git] / tools / testing / selftests / kvm / dirty_log_test.c
index 9215b26..471baec 100644 (file)
 #include <unistd.h>
 #include <time.h>
 #include <pthread.h>
+#include <semaphore.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
+#include <asm/barrier.h>
 
 #include "test_util.h"
 #include "kvm_util.h"
 # define test_and_clear_bit_le test_and_clear_bit
 #endif
 
+#define TEST_DIRTY_RING_COUNT          65536
+
+#define SIG_IPI SIGUSR1
+
 /*
  * Guest/Host shared variables. Ensure addr_gva2hva() and/or
  * sync_global_to/from_guest() are used when accessing from
@@ -128,6 +137,31 @@ static uint64_t host_dirty_count;
 static uint64_t host_clear_count;
 static uint64_t host_track_next_count;
 
+/* Whether dirty ring reset is requested, or finished */
+static sem_t dirty_ring_vcpu_stop;
+static sem_t dirty_ring_vcpu_cont;
+/*
+ * This is updated by the vcpu thread to tell the host whether it's a
+ * ring-full event.  It should only be read after a sem_wait() of
+ * dirty_ring_vcpu_stop and before the vcpu continues to run.
+ */
+static bool dirty_ring_vcpu_ring_full;
+/*
+ * This is only used for verifying the dirty pages.  Dirty ring has a very
+ * tricky case when the ring just got full, kvm will do userspace exit due to
+ * ring full.  When that happens, the very last PFN is set but actually the
+ * data is not changed (the guest WRITE is not really applied yet), because
+ * we found that the dirty ring is full, refused to continue the vcpu, and
+ * recorded the dirty gfn with the old contents.
+ *
+ * For this specific case, it's safe to skip checking this pfn for this
+ * bit, because it's a redundant bit, and when the write happens later the bit
+ * will be set again.  We use this variable to always keep track of the latest
+ * dirty gfn we've collected, so that if a mismatch of data is found later in
+ * the verifying process, we let it pass.
+ */
+static uint64_t dirty_ring_last_page;
+
 enum log_mode_t {
        /* Only use KVM_GET_DIRTY_LOG for logging */
        LOG_MODE_DIRTY_LOG = 0,
@@ -135,6 +169,9 @@ enum log_mode_t {
        /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
        LOG_MODE_CLEAR_LOG = 1,
 
+       /* Use dirty ring for logging */
+       LOG_MODE_DIRTY_RING = 2,
+
        LOG_MODE_NUM,
 
        /* Run all supported modes */
@@ -145,6 +182,26 @@ enum log_mode_t {
 static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
 /* Logging mode for current run */
 static enum log_mode_t host_log_mode;
+static pthread_t vcpu_thread;
+static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT;
+
+/* Deliver SIG_IPI to the vcpu thread. */
+static void vcpu_kick(void)
+{
+       pthread_t target = vcpu_thread;
+
+       pthread_kill(target, SIG_IPI);
+}
+
+/*
+ * sem_wait() wrapper for a test that sends signals to its own threads:
+ * whenever the wait is interrupted by a signal (EINTR) it is simply
+ * restarted.  Any other outcome of sem_wait() ends the loop.
+ */
+static void sem_wait_until(sem_t *sem)
+{
+       for (;;) {
+               if (sem_wait(sem) != -1 || errno != EINTR)
+                       return;
+       }
+}
 
 static bool clear_log_supported(void)
 {
@@ -178,15 +235,152 @@ static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
        kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
 }
 
+/*
+ * Default hook called after each vcpu run.
+ *
+ * @ret/@err are the return value and errno of the run.  The run must
+ * either have succeeded (0) or been interrupted (-1 with EINTR), and the
+ * guest must have exited through a UCALL_SYNC ucall.
+ */
-static void default_after_vcpu_run(struct kvm_vm *vm)
+static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct kvm_run *run = vcpu_state(vm, VCPU_ID);
 
+       TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
+                   "vcpu run failed: errno=%d", err);
+
        TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
                    "Invalid guest sync status: exit_reason=%s\n",
                    exit_reason_str(run->exit_reason));
 }
 
+/* Return true when kvm_check_cap() reports KVM_CAP_DIRTY_LOG_RING. */
+static bool dirty_ring_supported(void)
+{
+       return kvm_check_cap(KVM_CAP_DIRTY_LOG_RING) != 0;
+}
+
+/*
+ * create_vm_done hook for dirty-ring mode: enable the dirty ring on @vm,
+ * sized as test_dirty_ring_count entries of struct kvm_dirty_gfn.
+ */
+static void dirty_ring_create_vm_done(struct kvm_vm *vm)
+{
+       size_t ring_bytes = test_dirty_ring_count * sizeof(struct kvm_dirty_gfn);
+
+       /*
+        * The switch to dirty ring mode has to happen after the VM is
+        * created but before any vcpu is created.
+        */
+       vm_enable_dirty_ring(vm, ring_bytes);
+}
+
+/*
+ * Return true if the ring entry @gfn is currently flagged as dirty.
+ *
+ * The flags are loaded with acquire semantics so that the reads of the
+ * entry's slot/offset which follow cannot be reordered before the flag
+ * check.
+ */
+static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+       return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+/*
+ * Flag the ring entry @gfn as collected (KVM_DIRTY_GFN_F_RESET).
+ *
+ * The flag is stored with release semantics so that all earlier accesses
+ * to the entry are complete before the new flag value becomes visible.
+ */
+static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+       smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
+}
+
+/*
+ * Harvest dirty entries from one dirty ring, starting at *fetch_index.
+ *
+ * @dirty_gfns:  base of the ring (test_dirty_ring_count entries)
+ * @slot:        memslot every harvested entry is expected to belong to
+ * @bitmap:      bitmap in which the offset of each harvested page is set
+ * @num_pages:   exclusive upper bound for a valid page offset
+ * @fetch_index: running (non-wrapped) index of the next entry to read;
+ *               advanced by one for every entry harvested
+ *
+ * Harvesting stops at the first entry that is not flagged dirty.  Each
+ * harvested entry is also recorded in dirty_ring_last_page and flagged as
+ * collected.
+ *
+ * Returns the number of entries harvested by this call.
+ */
+static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
+                                      int slot, void *bitmap,
+                                      uint32_t num_pages, uint32_t *fetch_index)
+{
+       struct kvm_dirty_gfn *cur;
+       uint32_t count = 0;
+
+       while (true) {
+               /* The ring is circular: wrap the running index. */
+               cur = &dirty_gfns[*fetch_index % test_dirty_ring_count];
+               if (!dirty_gfn_is_dirtied(cur))
+                       break;
+               TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
+                           "%u != %d", cur->slot, slot);
+               TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
+                           "0x%llx >= 0x%x", cur->offset, num_pages);
+               set_bit_le(cur->offset, bitmap);
+               /* Remember the most recently collected page for the verifier. */
+               dirty_ring_last_page = cur->offset;
+               dirty_gfn_set_collected(cur);
+               (*fetch_index)++;
+               count++;
+       }
+
+       return count;
+}
+
+/*
+ * Kick the vcpu thread, then block until it posts dirty_ring_vcpu_stop.
+ */
+static void dirty_ring_wait_vcpu(void)
+{
+       /* This makes sure that the hardware PML cache is flushed */
+       vcpu_kick();
+       sem_wait_until(&dirty_ring_vcpu_stop);
+}
+
+/*
+ * Log the hand-over and post dirty_ring_vcpu_cont, the semaphore the vcpu
+ * thread waits on after it has stopped.
+ */
+static void dirty_ring_continue_vcpu(void)
+{
+       pr_info("Notifying vcpu to continue\n");
+       sem_post(&dirty_ring_vcpu_cont);
+}
+
+/*
+ * collect_dirty_pages hook for dirty-ring mode.
+ *
+ * Stops the vcpu, harvests its dirty ring into @bitmap, resets the ring
+ * and checks that the number of reset entries equals the number
+ * harvested.  If the vcpu stopped for a reason other than ring-full it is
+ * released before harvesting; on ring-full it is released only after the
+ * ring has been reset.
+ *
+ * @vm:        VM whose (single) vcpu ring is harvested
+ * @slot:      memslot the dirty entries are expected to belong to
+ * @bitmap:    bitmap receiving one bit per dirty page
+ * @num_pages: exclusive upper bound for a valid page offset
+ */
+static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
+                                          void *bitmap, uint32_t num_pages)
+{
+       /* We only have one vcpu, so one persistent read index is enough */
+       static uint32_t fetch_index = 0;
+       uint32_t count, cleared;
+       bool continued_vcpu = false;
+
+       dirty_ring_wait_vcpu();
+
+       if (!dirty_ring_vcpu_ring_full) {
+               /*
+                * This is not a ring-full event, it's safe to allow
+                * vcpu to continue
+                */
+               dirty_ring_continue_vcpu();
+               continued_vcpu = true;
+       }
+
+       /* Only have one vcpu */
+       count = dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID),
+                                      slot, bitmap, num_pages, &fetch_index);
+
+       cleared = kvm_vm_reset_dirty_ring(vm);
+
+       /* Cleared pages should be the same as collected */
+       TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
+                   "with collected (%u)", cleared, count);
+
+       if (!continued_vcpu) {
+               TEST_ASSERT(dirty_ring_vcpu_ring_full,
+                           "Didn't continue vcpu even without ring full");
+               dirty_ring_continue_vcpu();
+       }
+
+       /* iteration is printed with PRIu64 elsewhere in this file */
+       pr_info("Iteration %"PRIu64" collected %u pages\n", iteration, count);
+}
+
+/*
+ * after_vcpu_run hook for dirty-ring mode.
+ *
+ * @ret/@err are the return value and errno of the vcpu run.  Three
+ * outcomes are handled:
+ *  - the guest exited with a UCALL_SYNC ucall: nothing to do;
+ *  - the exit reason is KVM_EXIT_DIRTY_RING_FULL, or the run was
+ *    interrupted (-1 with EINTR): publish which of the two it was, post
+ *    dirty_ring_vcpu_stop and wait on dirty_ring_vcpu_cont;
+ *  - anything else fails the test.
+ */
+static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
+{
+       struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+       /* A ucall-sync or ring-full event is allowed */
+       if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
+               /* We should allow this to continue */
+               ;
+       } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL ||
+                  (ret == -1 && err == EINTR)) {
+               /* Update the flag first before pause */
+               WRITE_ONCE(dirty_ring_vcpu_ring_full,
+                          run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
+               /* Signal that this thread has stopped */
+               sem_post(&dirty_ring_vcpu_stop);
+               pr_info("vcpu stops because %s...\n",
+                       dirty_ring_vcpu_ring_full ?
+                       "dirty ring is full" : "vcpu is kicked out");
+               /* Stay paused until dirty_ring_vcpu_cont is posted */
+               sem_wait_until(&dirty_ring_vcpu_cont);
+               pr_info("vcpu continues now.\n");
+       } else {
+               TEST_ASSERT(false, "Invalid guest sync status: "
+                           "exit_reason=%s\n",
+                           exit_reason_str(run->exit_reason));
+       }
+}
+
+/*
+ * before_vcpu_join hook for dirty-ring mode: post dirty_ring_vcpu_cont
+ * once more.
+ */
+static void dirty_ring_before_vcpu_join(void)
+{
+       /*
+        * Kick another round of vcpu just to make sure it will quit: a vcpu
+        * thread still waiting on dirty_ring_vcpu_cont is released by this.
+        */
+       sem_post(&dirty_ring_vcpu_cont);
+}
+
 struct log_mode {
        const char *name;
        /* Return true if this mode is supported, otherwise false */
@@ -197,7 +391,8 @@ struct log_mode {
        void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
                                     void *bitmap, uint32_t num_pages);
        /* Hook to call when after each vcpu run */
-       void (*after_vcpu_run)(struct kvm_vm *vm);
+       void (*after_vcpu_run)(struct kvm_vm *vm, int ret, int err);
+       void (*before_vcpu_join) (void);
 } log_modes[LOG_MODE_NUM] = {
        {
                .name = "dirty-log",
@@ -211,6 +406,14 @@ struct log_mode {
                .collect_dirty_pages = clear_log_collect_dirty_pages,
                .after_vcpu_run = default_after_vcpu_run,
        },
+       {
+               .name = "dirty-ring",
+               .supported = dirty_ring_supported,
+               .create_vm_done = dirty_ring_create_vm_done,
+               .collect_dirty_pages = dirty_ring_collect_dirty_pages,
+               .before_vcpu_join = dirty_ring_before_vcpu_join,
+               .after_vcpu_run = dirty_ring_after_vcpu_run,
+       },
 };
 
 /*
@@ -260,12 +463,20 @@ static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
        mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
 }
 
+/*
+ * Call the after_vcpu_run hook of the current logging mode, if it has
+ * one, passing on the return value (@ret) and errno (@err) of the vcpu run.
+ */
-static void log_mode_after_vcpu_run(struct kvm_vm *vm)
+static void log_mode_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct log_mode *mode = &log_modes[host_log_mode];
 
        if (mode->after_vcpu_run)
-               mode->after_vcpu_run(vm);
+               mode->after_vcpu_run(vm, ret, err);
+}
+
+/*
+ * Call the before_vcpu_join hook of the current logging mode; modes that
+ * do not provide one are skipped.
+ */
+static void log_mode_before_vcpu_join(void)
+{
+       void (*hook)(void) = log_modes[host_log_mode].before_vcpu_join;
+
+       if (hook)
+               hook();
 }
 
 static void generate_random_array(uint64_t *guest_array, uint64_t size)
@@ -278,20 +489,44 @@ static void generate_random_array(uint64_t *guest_array, uint64_t size)
 
 static void *vcpu_worker(void *data)
 {
-       int ret;
+       int ret, vcpu_fd;
        struct kvm_vm *vm = data;
        uint64_t *guest_array;
        uint64_t pages_count = 0;
+       struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset)
+                                                + sizeof(sigset_t));
+       sigset_t *sigset = (sigset_t *) &sigmask->sigset;
+
+       vcpu_fd = vcpu_get_fd(vm, VCPU_ID);
+
+       /*
+        * SIG_IPI is unblocked atomically while in KVM_RUN.  It causes the
+        * ioctl to return with -EINTR, but it is still pending and we need
+        * to accept it with the sigwait.
+        */
+       sigmask->len = 8;
+       pthread_sigmask(0, NULL, sigset);
+       vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
+       sigaddset(sigset, SIG_IPI);
+       pthread_sigmask(SIG_BLOCK, sigset, NULL);
+
+       sigemptyset(sigset);
+       sigaddset(sigset, SIG_IPI);
 
        guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
 
        while (!READ_ONCE(host_quit)) {
+               /* Clear any existing kick signals */
                generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
                pages_count += TEST_PAGES_PER_LOOP;
                /* Let the guest dirty the random pages */
-               ret = _vcpu_run(vm, VCPU_ID);
-               TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
-               log_mode_after_vcpu_run(vm);
+               ret = ioctl(vcpu_fd, KVM_RUN, NULL);
+               if (ret == -1 && errno == EINTR) {
+                       int sig = -1;
+                       sigwait(sigset, &sig);
+                       assert(sig == SIG_IPI);
+               }
+               log_mode_after_vcpu_run(vm, ret, errno);
        }
 
        pr_info("Dirtied %"PRIu64" pages\n", pages_count);
@@ -304,6 +539,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
        uint64_t step = vm_num_host_pages(mode, 1);
        uint64_t page;
        uint64_t *value_ptr;
+       uint64_t min_iter = 0;
 
        for (page = 0; page < host_num_pages; page += step) {
                value_ptr = host_test_mem + page * host_page_size;
@@ -318,14 +554,64 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
                }
 
                if (test_and_clear_bit_le(page, bmap)) {
+                       bool matched;
+
                        host_dirty_count++;
+
                        /*
                         * If the bit is set, the value written onto
                         * the corresponding page should be either the
                         * previous iteration number or the current one.
                         */
-                       TEST_ASSERT(*value_ptr == iteration ||
-                                   *value_ptr == iteration - 1,
+                       matched = (*value_ptr == iteration ||
+                                  *value_ptr == iteration - 1);
+
+                       if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
+                               if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
+                                       /*
+                                        * Short answer: this case is special
+                                        * only for dirty ring test where the
+                                        * page is the last page before a kvm
+                                        * dirty ring full in iteration N-2.
+                                        *
+                                        * Long answer: Assuming ring size R,
+                                        * one possible condition is:
+                                        *
+                                        *      main thr       vcpu thr
+                                        *      --------       --------
+                                        *    iter=1
+                                        *                   write 1 to page 0~(R-1)
+                                        *                   full, vmexit
+                                        *    collect 0~(R-1)
+                                        *    kick vcpu
+                                        *                   write 1 to (R-1)~(2R-2)
+                                        *                   full, vmexit
+                                        *    iter=2
+                                        *    collect (R-1)~(2R-2)
+                                        *    kick vcpu
+                                        *                   write 1 to (2R-2)
+                                        *                   (NOTE!!! "1" cached in cpu reg)
+                                        *                   write 2 to (2R-1)~(3R-3)
+                                        *                   full, vmexit
+                                        *    iter=3
+                                        *    collect (2R-2)~(3R-3)
+                                        *    (here if we read value on page
+                                        *     "2R-2" is 1, while iter=3!!!)
+                                        *
+                                        * This however can only happen once per iteration.
+                                        */
+                                       min_iter = iteration - 1;
+                                       continue;
+                               } else if (page == dirty_ring_last_page) {
+                                       /*
+                                        * Please refer to comments in
+                                        * dirty_ring_last_page.
+                                        */
+                                       continue;
+                               }
+                       }
+
+                       TEST_ASSERT(matched,
                                    "Set page %"PRIu64" value %"PRIu64
                                    " incorrect (iteration=%"PRIu64")",
                                    page, *value_ptr, iteration);
@@ -390,7 +676,6 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
 static void run_test(enum vm_guest_mode mode, unsigned long iterations,
                     unsigned long interval, uint64_t phys_offset)
 {
-       pthread_t vcpu_thread;
        struct kvm_vm *vm;
        unsigned long *bmap;
 
@@ -455,9 +740,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
        /* Cache the HVA pointer of the region */
        host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
 
-#ifdef __x86_64__
-       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
-#endif
        ucall_init(vm, NULL);
 
        /* Export the shared variables to the guest */
@@ -488,6 +770,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
 
        /* Tell the vcpu thread to quit */
        host_quit = true;
+       log_mode_before_vcpu_join();
        pthread_join(vcpu_thread, NULL);
 
        pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
@@ -518,6 +801,9 @@ static void help(char *name)
        printf("usage: %s [-h] [-i iterations] [-I interval] "
               "[-p offset] [-m mode]\n", name);
        puts("");
+       printf(" -c: specify dirty ring size, in number of entries\n");
+       printf("     (only useful for dirty-ring test; default: %"PRIu32")\n",
+              TEST_DIRTY_RING_COUNT);
        printf(" -i: specify iteration counts (default: %"PRIu64")\n",
               TEST_HOST_LOOP_N);
        printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
@@ -548,6 +834,9 @@ int main(int argc, char *argv[])
        unsigned int mode;
        int opt, i, j;
 
+       sem_init(&dirty_ring_vcpu_stop, 0, 0);
+       sem_init(&dirty_ring_vcpu_cont, 0, 0);
+
 #ifdef __x86_64__
        guest_mode_init(VM_MODE_PXXV48_4K, true, true);
 #endif
@@ -570,8 +859,11 @@ int main(int argc, char *argv[])
        guest_mode_init(VM_MODE_P40V48_4K, true, true);
 #endif
 
-       while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) {
+       while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) {
                switch (opt) {
+               case 'c':
+                       test_dirty_ring_count = strtol(optarg, NULL, 10);
+                       break;
                case 'i':
                        iterations = strtol(optarg, NULL, 10);
                        break;