tools/testing/selftests/kvm/dirty_log_test.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * KVM dirty page logging test
   4  *
   5  * Copyright (C) 2018, Red Hat, Inc.
   6  */
   7
   8 #define _GNU_SOURCE /* for program_invocation_name */
   9
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <pthread.h>
  13 #include <semaphore.h>
  14 #include <sys/types.h>
  15 #include <signal.h>
  16 #include <errno.h>
  17 #include <linux/bitmap.h>
  18 #include <linux/bitops.h>
  19 #include <linux/atomic.h>
  20 #include <asm/barrier.h>
  21
  22 #include "kvm_util.h"
  23 #include "test_util.h"
  24 #include "guest_modes.h"
  25 #include "processor.h"
  26
  27 #define DIRTY_MEM_BITS 30 /* 1G */
  28 #define PAGE_SHIFT_4K  12
  29
  30 /* The memory slot index to track dirty pages */
  31 #define TEST_MEM_SLOT_INDEX             1
  32
  33 /* Default guest test virtual memory offset */
  34 #define DEFAULT_GUEST_TEST_MEM          0xc0000000
  35
  36 /* How many pages to dirty for each guest loop */
  37 #define TEST_PAGES_PER_LOOP             1024
  38
  39 /* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
  40 #define TEST_HOST_LOOP_N                32UL
  41
  42 /* Interval for each host loop (ms) */
  43 #define TEST_HOST_LOOP_INTERVAL         10UL
  44
  45 /* Dirty bitmaps are always little endian, so we need to swap on big endian */
  46 #if defined(__s390x__)
  47 # define BITOP_LE_SWIZZLE       ((BITS_PER_LONG-1) & ~0x7)
  48 # define test_bit_le(nr, addr) \
  49         test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
  50 # define __set_bit_le(nr, addr) \
  51         __set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
  52 # define __clear_bit_le(nr, addr) \
  53         __clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
  54 # define __test_and_set_bit_le(nr, addr) \
  55         __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
  56 # define __test_and_clear_bit_le(nr, addr) \
  57         __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
  58 #else
  59 # define test_bit_le                    test_bit
  60 # define __set_bit_le                   __set_bit
  61 # define __clear_bit_le                 __clear_bit
  62 # define __test_and_set_bit_le          __test_and_set_bit
  63 # define __test_and_clear_bit_le        __test_and_clear_bit
  64 #endif
  65
  66 #define TEST_DIRTY_RING_COUNT           65536
  67
  68 #define SIG_IPI SIGUSR1
  69
  70 /*
  71  * Guest/Host shared variables. Ensure addr_gva2hva() and/or
  72  * sync_global_to/from_guest() are used when accessing from
  73  * the host. READ/WRITE_ONCE() should also be used with anything
  74  * that may change.
  75  */
  76 static uint64_t host_page_size;
  77 static uint64_t guest_page_size;
  78 static uint64_t guest_num_pages;
  79 static uint64_t random_array[TEST_PAGES_PER_LOOP];
  80 static uint64_t iteration;
  81
  82 /*
  83  * Guest physical memory offset of the testing memory slot.
  84  * This will be set to the topmost valid physical address minus
  85  * the test memory size.
  86  */
  87 static uint64_t guest_test_phys_mem;
  88
  89 /*
  90  * Guest virtual memory offset of the testing memory slot.
  91  * Must not conflict with identity mapped test code.
  92  */
  93 static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
  94
  95 /*
  96  * Continuously write to the first 8 bytes of a random pages within
  97  * the testing memory region.
  98  */
  99 static void guest_code(void)
 100 {
 101         uint64_t addr;
 102         int i;
 103
 104         /*
 105          * On s390x, all pages of a 1M segment are initially marked as dirty
 106          * when a page of the segment is written to for the very first time.
 107          * To compensate this specialty in this test, we need to touch all
 108          * pages during the first iteration.
 109          */
 110         for (i = 0; i < guest_num_pages; i++) {
 111                 addr = guest_test_virt_mem + i * guest_page_size;
 112                 *(uint64_t *)addr = READ_ONCE(iteration);
 113         }
 114
 115         while (true) {
 116                 for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
 117                         addr = guest_test_virt_mem;
 118                         addr += (READ_ONCE(random_array[i]) % guest_num_pages)
 119                                 * guest_page_size;
 120                         addr = align_down(addr, host_page_size);
 121                         *(uint64_t *)addr = READ_ONCE(iteration);
 122                 }
 123
 124                 /* Tell the host that we need more random numbers */
 125                 GUEST_SYNC(1);
 126         }
 127 }
 128
 129 /* Host variables */
 130 static bool host_quit;
 131
 132 /* Points to the test VM memory region on which we track dirty logs */
 133 static void *host_test_mem;
 134 static uint64_t host_num_pages;
 135
 136 /* For statistics only */
 137 static uint64_t host_dirty_count;
 138 static uint64_t host_clear_count;
 139 static uint64_t host_track_next_count;
 140
 141 /* Whether dirty ring reset is requested, or finished */
 142 static sem_t sem_vcpu_stop;
 143 static sem_t sem_vcpu_cont;
 144 /*
 145  * This is only set by main thread, and only cleared by vcpu thread.  It is
 146  * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC
 147  * is the only place that we'll guarantee both "dirty bit" and "dirty data"
 148  * will match.  E.g., SIG_IPI won't guarantee that if the vcpu is interrupted
 149  * after setting dirty bit but before the data is written.
 150  */
 151 static atomic_t vcpu_sync_stop_requested;
 152 /*
 153  * This is updated by the vcpu thread to tell the host whether it's a
 154  * ring-full event.  It should only be read until a sem_wait() of
 155  * sem_vcpu_stop and before vcpu continues to run.
 156  */
 157 static bool dirty_ring_vcpu_ring_full;
 158 /*
 159  * This is only used for verifying the dirty pages.  Dirty ring has a very
 160  * tricky case when the ring just got full, kvm will do userspace exit due to
 161  * ring full.  When that happens, the very last PFN is set but actually the
 162  * data is not changed (the guest WRITE is not really applied yet), because
 163  * we found that the dirty ring is full, refused to continue the vcpu, and
 164  * recorded the dirty gfn with the old contents.
 165  *
 166  * For this specific case, it's safe to skip checking this pfn for this
 167  * bit, because it's a redundant bit, and when the write happens later the bit
 168  * will be set again.  We use this variable to always keep track of the latest
 169  * dirty gfn we've collected, so that if a mismatch of data found later in the
 170  * verifying process, we let it pass.
 171  */
 172 static uint64_t dirty_ring_last_page;
 173
 174 enum log_mode_t {
 175         /* Only use KVM_GET_DIRTY_LOG for logging */
 176         LOG_MODE_DIRTY_LOG = 0,
 177
 178         /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
 179         LOG_MODE_CLEAR_LOG = 1,
 180
 181         /* Use dirty ring for logging */
 182         LOG_MODE_DIRTY_RING = 2,
 183
 184         LOG_MODE_NUM,
 185
 186         /* Run all supported modes */
 187         LOG_MODE_ALL = LOG_MODE_NUM,
 188 };
 189
 190 /* Mode of logging to test.  Default is to run all supported modes */
 191 static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
 192 /* Logging mode for current run */
 193 static enum log_mode_t host_log_mode;
 194 static pthread_t vcpu_thread;
 195 static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT;
 196
 197 static void vcpu_kick(void)
 198 {
 199         pthread_kill(vcpu_thread, SIG_IPI);
 200 }
 201
 202 /*
 203  * In our test we do signal tricks, let's use a better version of
 204  * sem_wait to avoid signal interrupts
 205  */
 206 static void sem_wait_until(sem_t *sem)
 207 {
 208         int ret;
 209
 210         do
 211                 ret = sem_wait(sem);
 212         while (ret == -1 && errno == EINTR);
 213 }
 214
 215 static bool clear_log_supported(void)
 216 {
 217         return kvm_has_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
 218 }
 219
 220 static void clear_log_create_vm_done(struct kvm_vm *vm)
 221 {
 222         u64 manual_caps;
 223
 224         manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
 225         TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
 226         manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
 227                         KVM_DIRTY_LOG_INITIALLY_SET);
 228         vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, manual_caps);
 229 }
 230
 231 static void dirty_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
 232                                           void *bitmap, uint32_t num_pages,
 233                                           uint32_t *unused)
 234 {
 235         kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap);
 236 }
 237
 238 static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
 239                                           void *bitmap, uint32_t num_pages,
 240                                           uint32_t *unused)
 241 {
 242         kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap);
 243         kvm_vm_clear_dirty_log(vcpu->vm, slot, bitmap, 0, num_pages);
 244 }
 245
 246 /* Should only be called after a GUEST_SYNC */
 247 static void vcpu_handle_sync_stop(void)
 248 {
 249         if (atomic_read(&vcpu_sync_stop_requested)) {
 250                 /* It means main thread is sleeping waiting */
 251                 atomic_set(&vcpu_sync_stop_requested, false);
 252                 sem_post(&sem_vcpu_stop);
 253                 sem_wait_until(&sem_vcpu_cont);
 254         }
 255 }
 256
 257 static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
 258 {
 259         struct kvm_run *run = vcpu->run;
 260
 261         TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
 262                     "vcpu run failed: errno=%d", err);
 263
 264         TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
 265                     "Invalid guest sync status: exit_reason=%s",
 266                     exit_reason_str(run->exit_reason));
 267
 268         vcpu_handle_sync_stop();
 269 }
 270
 271 static bool dirty_ring_supported(void)
 272 {
 273         return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) ||
 274                 kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL));
 275 }
 276
 277 static void dirty_ring_create_vm_done(struct kvm_vm *vm)
 278 {
 279         uint64_t pages;
 280         uint32_t limit;
 281
 282         /*
 283          * We rely on vcpu exit due to full dirty ring state. Adjust
 284          * the ring buffer size to ensure we're able to reach the
 285          * full dirty ring state.
 286          */
 287         pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3;
 288         pages = vm_adjust_num_guest_pages(vm->mode, pages);
 289         if (vm->page_size < getpagesize())
 290                 pages = vm_num_host_pages(vm->mode, pages);
 291
 292         limit = 1 << (31 - __builtin_clz(pages));
 293         test_dirty_ring_count = 1 << (31 - __builtin_clz(test_dirty_ring_count));
 294         test_dirty_ring_count = min(limit, test_dirty_ring_count);
 295         pr_info("dirty ring count: 0x%x\n", test_dirty_ring_count);
 296
 297         /*
 298          * Switch to dirty ring mode after VM creation but before any
 299          * of the vcpu creation.
 300          */
 301         vm_enable_dirty_ring(vm, test_dirty_ring_count *
 302                              sizeof(struct kvm_dirty_gfn));
 303 }
 304
 305 static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
 306 {
 307         return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
 308 }
 309
 310 static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
 311 {
 312         smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
 313 }
 314
 315 static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
 316                                        int slot, void *bitmap,
 317                                        uint32_t num_pages, uint32_t *fetch_index)
 318 {
 319         struct kvm_dirty_gfn *cur;
 320         uint32_t count = 0;
 321
 322         while (true) {
 323                 cur = &dirty_gfns[*fetch_index % test_dirty_ring_count];
 324                 if (!dirty_gfn_is_dirtied(cur))
 325                         break;
 326                 TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
 327                             "%u != %u", cur->slot, slot);
 328                 TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
 329                             "0x%llx >= 0x%x", cur->offset, num_pages);
 330                 //pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset);
 331                 __set_bit_le(cur->offset, bitmap);
 332                 dirty_ring_last_page = cur->offset;
 333                 dirty_gfn_set_collected(cur);
 334                 (*fetch_index)++;
 335                 count++;
 336         }
 337
 338         return count;
 339 }
 340
 341 static void dirty_ring_wait_vcpu(void)
 342 {
 343         /* This makes sure that hardware PML cache flushed */
 344         vcpu_kick();
 345         sem_wait_until(&sem_vcpu_stop);
 346 }
 347
 348 static void dirty_ring_continue_vcpu(void)
 349 {
 350         pr_info("Notifying vcpu to continue\n");
 351         sem_post(&sem_vcpu_cont);
 352 }
 353
 354 static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
 355                                            void *bitmap, uint32_t num_pages,
 356                                            uint32_t *ring_buf_idx)
 357 {
 358         uint32_t count = 0, cleared;
 359         bool continued_vcpu = false;
 360
 361         dirty_ring_wait_vcpu();
 362
 363         if (!dirty_ring_vcpu_ring_full) {
 364                 /*
 365                  * This is not a ring-full event, it's safe to allow
 366                  * vcpu to continue
 367                  */
 368                 dirty_ring_continue_vcpu();
 369                 continued_vcpu = true;
 370         }
 371
 372         /* Only have one vcpu */
 373         count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu),
 374                                        slot, bitmap, num_pages,
 375                                        ring_buf_idx);
 376
 377         cleared = kvm_vm_reset_dirty_ring(vcpu->vm);
 378
 379         /*
 380          * Cleared pages should be the same as collected, as KVM is supposed to
 381          * clear only the entries that have been harvested.
 382          */
 383         TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
 384                     "with collected (%u)", cleared, count);
 385
 386         if (!continued_vcpu) {
 387                 TEST_ASSERT(dirty_ring_vcpu_ring_full,
 388                             "Didn't continue vcpu even without ring full");
 389                 dirty_ring_continue_vcpu();
 390         }
 391
 392         pr_info("Iteration %ld collected %u pages\n", iteration, count);
 393 }
 394
 395 static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
 396 {
 397         struct kvm_run *run = vcpu->run;
 398
 399         /* A ucall-sync or ring-full event is allowed */
 400         if (get_ucall(vcpu, NULL) == UCALL_SYNC) {
 401                 /* We should allow this to continue */
 402                 ;
 403         } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL ||
 404                    (ret == -1 && err == EINTR)) {
 405                 /* Update the flag first before pause */
 406                 WRITE_ONCE(dirty_ring_vcpu_ring_full,
 407                            run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
 408                 sem_post(&sem_vcpu_stop);
 409                 pr_info("vcpu stops because %s...\n",
 410                         dirty_ring_vcpu_ring_full ?
 411                         "dirty ring is full" : "vcpu is kicked out");
 412                 sem_wait_until(&sem_vcpu_cont);
 413                 pr_info("vcpu continues now.\n");
 414         } else {
 415                 TEST_ASSERT(false, "Invalid guest sync status: "
 416                             "exit_reason=%s",
 417                             exit_reason_str(run->exit_reason));
 418         }
 419 }
 420
 421 struct log_mode {
 422         const char *name;
 423         /* Return true if this mode is supported, otherwise false */
 424         bool (*supported)(void);
 425         /* Hook when the vm creation is done (before vcpu creation) */
 426         void (*create_vm_done)(struct kvm_vm *vm);
 427         /* Hook to collect the dirty pages into the bitmap provided */
 428         void (*collect_dirty_pages) (struct kvm_vcpu *vcpu, int slot,
 429                                      void *bitmap, uint32_t num_pages,
 430                                      uint32_t *ring_buf_idx);
 431         /* Hook to call when after each vcpu run */
 432         void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err);
 433 } log_modes[LOG_MODE_NUM] = {
 434         {
 435                 .name = "dirty-log",
 436                 .collect_dirty_pages = dirty_log_collect_dirty_pages,
 437                 .after_vcpu_run = default_after_vcpu_run,
 438         },
 439         {
 440                 .name = "clear-log",
 441                 .supported = clear_log_supported,
 442                 .create_vm_done = clear_log_create_vm_done,
 443                 .collect_dirty_pages = clear_log_collect_dirty_pages,
 444                 .after_vcpu_run = default_after_vcpu_run,
 445         },
 446         {
 447                 .name = "dirty-ring",
 448                 .supported = dirty_ring_supported,
 449                 .create_vm_done = dirty_ring_create_vm_done,
 450                 .collect_dirty_pages = dirty_ring_collect_dirty_pages,
 451                 .after_vcpu_run = dirty_ring_after_vcpu_run,
 452         },
 453 };
 454
 455 /*
 456  * We use this bitmap to track some pages that should have its dirty
 457  * bit set in the _next_ iteration.  For example, if we detected the
 458  * page value changed to current iteration but at the same time the
 459  * page bit is cleared in the latest bitmap, then the system must
 460  * report that write in the next get dirty log call.
 461  */
 462 static unsigned long *host_bmap_track;
 463
 464 static void log_modes_dump(void)
 465 {
 466         int i;
 467
 468         printf("all");
 469         for (i = 0; i < LOG_MODE_NUM; i++)
 470                 printf(", %s", log_modes[i].name);
 471         printf("\n");
 472 }
 473
 474 static bool log_mode_supported(void)
 475 {
 476         struct log_mode *mode = &log_modes[host_log_mode];
 477
 478         if (mode->supported)
 479                 return mode->supported();
 480
 481         return true;
 482 }
 483
 484 static void log_mode_create_vm_done(struct kvm_vm *vm)
 485 {
 486         struct log_mode *mode = &log_modes[host_log_mode];
 487
 488         if (mode->create_vm_done)
 489                 mode->create_vm_done(vm);
 490 }
 491
 492 static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
 493                                          void *bitmap, uint32_t num_pages,
 494                                          uint32_t *ring_buf_idx)
 495 {
 496         struct log_mode *mode = &log_modes[host_log_mode];
 497
 498         TEST_ASSERT(mode->collect_dirty_pages != NULL,
 499                     "collect_dirty_pages() is required for any log mode!");
 500         mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages, ring_buf_idx);
 501 }
 502
 503 static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
 504 {
 505         struct log_mode *mode = &log_modes[host_log_mode];
 506
 507         if (mode->after_vcpu_run)
 508                 mode->after_vcpu_run(vcpu, ret, err);
 509 }
 510
 511 static void generate_random_array(uint64_t *guest_array, uint64_t size)
 512 {
 513         uint64_t i;
 514
 515         for (i = 0; i < size; i++)
 516                 guest_array[i] = random();
 517 }
 518
 519 static void *vcpu_worker(void *data)
 520 {
 521         int ret;
 522         struct kvm_vcpu *vcpu = data;
 523         struct kvm_vm *vm = vcpu->vm;
 524         uint64_t *guest_array;
 525         uint64_t pages_count = 0;
 526         struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset)
 527                                                  + sizeof(sigset_t));
 528         sigset_t *sigset = (sigset_t *) &sigmask->sigset;
 529
 530         /*
 531          * SIG_IPI is unblocked atomically while in KVM_RUN.  It causes the
 532          * ioctl to return with -EINTR, but it is still pending and we need
 533          * to accept it with the sigwait.
 534          */
 535         sigmask->len = 8;
 536         pthread_sigmask(0, NULL, sigset);
 537         sigdelset(sigset, SIG_IPI);
 538         vcpu_ioctl(vcpu, KVM_SET_SIGNAL_MASK, sigmask);
 539
 540         sigemptyset(sigset);
 541         sigaddset(sigset, SIG_IPI);
 542
 543         guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
 544
 545         while (!READ_ONCE(host_quit)) {
 546                 /* Clear any existing kick signals */
 547                 generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
 548                 pages_count += TEST_PAGES_PER_LOOP;
 549                 /* Let the guest dirty the random pages */
 550                 ret = __vcpu_run(vcpu);
 551                 if (ret == -1 && errno == EINTR) {
 552                         int sig = -1;
 553                         sigwait(sigset, &sig);
 554                         assert(sig == SIG_IPI);
 555                 }
 556                 log_mode_after_vcpu_run(vcpu, ret, errno);
 557         }
 558
 559         pr_info("Dirtied %"PRIu64" pages\n", pages_count);
 560
 561         return NULL;
 562 }
 563
 564 static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
 565 {
 566         uint64_t step = vm_num_host_pages(mode, 1);
 567         uint64_t page;
 568         uint64_t *value_ptr;
 569         uint64_t min_iter = 0;
 570
 571         for (page = 0; page < host_num_pages; page += step) {
 572                 value_ptr = host_test_mem + page * host_page_size;
 573
 574                 /* If this is a special page that we were tracking... */
 575                 if (__test_and_clear_bit_le(page, host_bmap_track)) {
 576                         host_track_next_count++;
 577                         TEST_ASSERT(test_bit_le(page, bmap),
 578                                     "Page %"PRIu64" should have its dirty bit "
 579                                     "set in this iteration but it is missing",
 580                                     page);
 581                 }
 582
 583                 if (__test_and_clear_bit_le(page, bmap)) {
 584                         bool matched;
 585
 586                         host_dirty_count++;
 587
 588                         /*
 589                          * If the bit is set, the value written onto
 590                          * the corresponding page should be either the
 591                          * previous iteration number or the current one.
 592                          */
 593                         matched = (*value_ptr == iteration ||
 594                                    *value_ptr == iteration - 1);
 595
 596                         if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
 597                                 if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
 598                                         /*
 599                                          * Short answer: this case is special
 600                                          * only for dirty ring test where the
 601                                          * page is the last page before a kvm
 602                                          * dirty ring full in iteration N-2.
 603                                          *
 604                                          * Long answer: Assuming ring size R,
 605                                          * one possible condition is:
 606                                          *
 607                                          *      main thr       vcpu thr
 608                                          *      --------       --------
 609                                          *    iter=1
 610                                          *                   write 1 to page 0~(R-1)
 611                                          *                   full, vmexit
 612                                          *    collect 0~(R-1)
 613                                          *    kick vcpu
 614                                          *                   write 1 to (R-1)~(2R-2)
 615                                          *                   full, vmexit
 616                                          *    iter=2
 617                                          *    collect (R-1)~(2R-2)
 618                                          *    kick vcpu
 619                                          *                   write 1 to (2R-2)
 620                                          *                   (NOTE!!! "1" cached in cpu reg)
 621                                          *                   write 2 to (2R-1)~(3R-3)
 622                                          *                   full, vmexit
 623                                          *    iter=3
 624                                          *    collect (2R-2)~(3R-3)
 625                                          *    (here if we read value on page
 626                                          *     "2R-2" is 1, while iter=3!!!)
 627                                          *
 628                                          * This however can only happen once per iteration.
 629                                          */
 630                                         min_iter = iteration - 1;
 631                                         continue;
 632                                 } else if (page == dirty_ring_last_page) {
 633                                         /*
 634                                          * Please refer to comments in
 635                                          * dirty_ring_last_page.
 636                                          */
 637                                         continue;
 638                                 }
 639                         }
 640
 641                         TEST_ASSERT(matched,
 642                                     "Set page %"PRIu64" value %"PRIu64
 643                                     " incorrect (iteration=%"PRIu64")",
 644                                     page, *value_ptr, iteration);
 645                 } else {
 646                         host_clear_count++;
 647                         /*
 648                          * If cleared, the value written can be any
 649                          * value smaller or equals to the iteration
 650                          * number.  Note that the value can be exactly
 651                          * (iteration-1) if that write can happen
 652                          * like this:
 653                          *
 654                          * (1) increase loop count to "iteration-1"
 655                          * (2) write to page P happens (with value
 656                          *     "iteration-1")
 657                          * (3) get dirty log for "iteration-1"; we'll
 658                          *     see that page P bit is set (dirtied),
 659                          *     and not set the bit in host_bmap_track
 660                          * (4) increase loop count to "iteration"
 661                          *     (which is current iteration)
 662                          * (5) get dirty log for current iteration,
 663                          *     we'll see that page P is cleared, with
 664                          *     value "iteration-1".
 665                          */
 666                         TEST_ASSERT(*value_ptr <= iteration,
 667                                     "Clear page %"PRIu64" value %"PRIu64
 668                                     " incorrect (iteration=%"PRIu64")",
 669                                     page, *value_ptr, iteration);
 670                         if (*value_ptr == iteration) {
 671                                 /*
 672                                  * This page is _just_ modified; it
 673                                  * should report its dirtyness in the
 674                                  * next run
 675                                  */
 676                                 __set_bit_le(page, host_bmap_track);
 677                         }
 678                 }
 679         }
 680 }
 681
 682 static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu,
 683                                 uint64_t extra_mem_pages, void *guest_code)
 684 {
 685         struct kvm_vm *vm;
 686
 687         pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
 688
 689         vm = __vm_create(VM_SHAPE(mode), 1, extra_mem_pages);
 690
 691         log_mode_create_vm_done(vm);
 692         *vcpu = vm_vcpu_add(vm, 0, guest_code);
 693         return vm;
 694 }
 695
 696 struct test_params {
 697         unsigned long iterations;
 698         unsigned long interval;
 699         uint64_t phys_offset;
 700 };
 701
 702 static void run_test(enum vm_guest_mode mode, void *arg)
 703 {
 704         struct test_params *p = arg;
 705         struct kvm_vcpu *vcpu;
 706         struct kvm_vm *vm;
 707         unsigned long *bmap;
 708         uint32_t ring_buf_idx = 0;
 709         int sem_val;
 710
 711         if (!log_mode_supported()) {
 712                 print_skip("Log mode '%s' not supported",
 713                            log_modes[host_log_mode].name);
 714                 return;
 715         }
 716
 717         /*
 718          * We reserve page table for 2 times of extra dirty mem which
 719          * will definitely cover the original (1G+) test range.  Here
 720          * we do the calculation with 4K page size which is the
 721          * smallest so the page number will be enough for all archs
 722          * (e.g., 64K page size guest will need even less memory for
 723          * page tables).
 724          */
 725         vm = create_vm(mode, &vcpu,
 726                        2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code);
 727
 728         guest_page_size = vm->page_size;
 729         /*
 730          * A little more than 1G of guest page sized pages.  Cover the
 731          * case where the size is not aligned to 64 pages.
 732          */
 733         guest_num_pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3;
 734         guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
 735
 736         host_page_size = getpagesize();
 737         host_num_pages = vm_num_host_pages(mode, guest_num_pages);
 738
 739         if (!p->phys_offset) {
 740                 guest_test_phys_mem = (vm->max_gfn - guest_num_pages) *
 741                                       guest_page_size;
 742                 guest_test_phys_mem = align_down(guest_test_phys_mem, host_page_size);
 743         } else {
 744                 guest_test_phys_mem = p->phys_offset;
 745         }
 746
 747 #ifdef __s390x__
 748         /* Align to 1M (segment size) */
 749         guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20);
 750 #endif
 751
 752         pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
 753
 754         bmap = bitmap_zalloc(host_num_pages);
 755         host_bmap_track = bitmap_zalloc(host_num_pages);
 756
 757         /* Add an extra memory slot for testing dirty logging */
 758         vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 759                                     guest_test_phys_mem,
 760                                     TEST_MEM_SLOT_INDEX,
 761                                     guest_num_pages,
 762                                     KVM_MEM_LOG_DIRTY_PAGES);
 763
 764         /* Do mapping for the dirty track memory slot */
 765         virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages);
 766
 767         /* Cache the HVA pointer of the region */
 768         host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
 769
 770         /* Export the shared variables to the guest */
 771         sync_global_to_guest(vm, host_page_size);
 772         sync_global_to_guest(vm, guest_page_size);
 773         sync_global_to_guest(vm, guest_test_virt_mem);
 774         sync_global_to_guest(vm, guest_num_pages);
 775
 776         /* Start the iterations */
 777         iteration = 1;
 778         sync_global_to_guest(vm, iteration);
 779         WRITE_ONCE(host_quit, false);
 780         host_dirty_count = 0;
 781         host_clear_count = 0;
 782         host_track_next_count = 0;
 783         WRITE_ONCE(dirty_ring_vcpu_ring_full, false);
 784
 785         /*
 786          * Ensure the previous iteration didn't leave a dangling semaphore, i.e.
 787          * that the main task and vCPU worker were synchronized and completed
 788          * verification of all iterations.
 789          */
 790         sem_getvalue(&sem_vcpu_stop, &sem_val);
 791         TEST_ASSERT_EQ(sem_val, 0);
 792         sem_getvalue(&sem_vcpu_cont, &sem_val);
 793         TEST_ASSERT_EQ(sem_val, 0);
 794
 795         pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu);
 796
 797         while (iteration < p->iterations) {
 798                 /* Give the vcpu thread some time to dirty some pages */
 799                 usleep(p->interval * 1000);
 800                 log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX,
 801                                              bmap, host_num_pages,
 802                                              &ring_buf_idx);
 803
 804                 /*
 805                  * See vcpu_sync_stop_requested definition for details on why
 806                  * we need to stop the vCPU when verifying data.
 807                  */
 808                 atomic_set(&vcpu_sync_stop_requested, true);
 809                 sem_wait_until(&sem_vcpu_stop);
 810                 /*
 811                  * NOTE: for dirty ring, it's possible that we didn't stop at
 812                  * GUEST_SYNC but instead we stopped because ring is full;
 813                  * that's okay too because ring full means we're only missing
 814                  * the flush of the last page, and since we handle the last
 815                  * page specially verification will succeed anyway.
 816                  */
 817                 assert(host_log_mode == LOG_MODE_DIRTY_RING ||
 818                        atomic_read(&vcpu_sync_stop_requested) == false);
 819                 vm_dirty_log_verify(mode, bmap);
 820
 821                 /*
 822                  * Set host_quit before sem_vcpu_cont in the final iteration to
 823                  * ensure that the vCPU worker doesn't resume the guest.  As
 824                  * above, the dirty ring test may stop and wait even when not
 825                  * explicitly requested to do so, i.e. would hang waiting for a
 826                  * "continue" if it's allowed to resume the guest.
 827                  */
 828                 if (++iteration == p->iterations)
 829                         WRITE_ONCE(host_quit, true);
 830
 831                 sem_post(&sem_vcpu_cont);
 832                 sync_global_to_guest(vm, iteration);
 833         }
 834
 835         pthread_join(vcpu_thread, NULL);
 836
 837         pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
 838                 "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
 839                 host_track_next_count);
 840
 841         free(bmap);
 842         free(host_bmap_track);
 843         kvm_vm_free(vm);
 844 }
 845
 846 static void help(char *name)
 847 {
 848         puts("");
 849         printf("usage: %s [-h] [-i iterations] [-I interval] "
 850                "[-p offset] [-m mode]\n", name);
 851         puts("");
 852         printf(" -c: hint to dirty ring size, in number of entries\n");
 853         printf("     (only useful for dirty-ring test; default: %"PRIu32")\n",
 854                TEST_DIRTY_RING_COUNT);
 855         printf(" -i: specify iteration counts (default: %"PRIu64")\n",
 856                TEST_HOST_LOOP_N);
 857         printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
 858                TEST_HOST_LOOP_INTERVAL);
 859         printf(" -p: specify guest physical test memory offset\n"
 860                "     Warning: a low offset can conflict with the loaded test code.\n");
 861         printf(" -M: specify the host logging mode "
 862                "(default: run all log modes).  Supported modes: \n\t");
 863         log_modes_dump();
 864         guest_modes_help();
 865         puts("");
 866         exit(0);
 867 }
 868
 869 int main(int argc, char *argv[])
 870 {
 871         struct test_params p = {
 872                 .iterations = TEST_HOST_LOOP_N,
 873                 .interval = TEST_HOST_LOOP_INTERVAL,
 874         };
 875         int opt, i;
 876         sigset_t sigset;
 877
 878         sem_init(&sem_vcpu_stop, 0, 0);
 879         sem_init(&sem_vcpu_cont, 0, 0);
 880
 881         guest_modes_append_default();
 882
 883         while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) {
 884                 switch (opt) {
 885                 case 'c':
 886                         test_dirty_ring_count = strtol(optarg, NULL, 10);
 887                         break;
 888                 case 'i':
 889                         p.iterations = strtol(optarg, NULL, 10);
 890                         break;
 891                 case 'I':
 892                         p.interval = strtol(optarg, NULL, 10);
 893                         break;
 894                 case 'p':
 895                         p.phys_offset = strtoull(optarg, NULL, 0);
 896                         break;
 897                 case 'm':
 898                         guest_modes_cmdline(optarg);
 899                         break;
 900                 case 'M':
 901                         if (!strcmp(optarg, "all")) {
 902                                 host_log_mode_option = LOG_MODE_ALL;
 903                                 break;
 904                         }
 905                         for (i = 0; i < LOG_MODE_NUM; i++) {
 906                                 if (!strcmp(optarg, log_modes[i].name)) {
 907                                         pr_info("Setting log mode to: '%s'\n",
 908                                                 optarg);
 909                                         host_log_mode_option = i;
 910                                         break;
 911                                 }
 912                         }
 913                         if (i == LOG_MODE_NUM) {
 914                                 printf("Log mode '%s' invalid. Please choose "
 915                                        "from: ", optarg);
 916                                 log_modes_dump();
 917                                 exit(1);
 918                         }
 919                         break;
 920                 case 'h':
 921                 default:
 922                         help(argv[0]);
 923                         break;
 924                 }
 925         }
 926
 927         TEST_ASSERT(p.iterations > 2, "Iterations must be greater than two");
 928         TEST_ASSERT(p.interval > 0, "Interval must be greater than zero");
 929
 930         pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
 931                 p.iterations, p.interval);
 932
 933         srandom(time(0));
 934
 935         /* Ensure that vCPU threads start with SIG_IPI blocked.  */
 936         sigemptyset(&sigset);
 937         sigaddset(&sigset, SIG_IPI);
 938         pthread_sigmask(SIG_BLOCK, &sigset, NULL);
 939
 940         if (host_log_mode_option == LOG_MODE_ALL) {
 941                 /* Run each log mode */
 942                 for (i = 0; i < LOG_MODE_NUM; i++) {
 943                         pr_info("Testing Log Mode '%s'\n", log_modes[i].name);
 944                         host_log_mode = i;
 945                         for_each_guest_mode(run_test, &p);
 946                 }
 947         } else {
 948                 host_log_mode = host_log_mode_option;
 949                 for_each_guest_mode(run_test, &p);
 950         }
 951
 952         return 0;
 953 }