KVM: Boost vCPU candidate in user mode which is delivering interrupt
[linux-2.6-microblaze.git] virt/kvm/kvm_main.c
index 001b9de..2799c66 100644
@@ -451,59 +451,220 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
-                                       struct mm_struct *mm,
-                                       unsigned long address,
-                                       pte_t pte)
+typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
+                            unsigned long end);
+
+struct kvm_hva_range {
+       unsigned long start;
+       unsigned long end;
+       pte_t pte;
+       hva_handler_t handler;
+       on_lock_fn_t on_lock;
+       bool flush_on_ret;
+       bool may_block;
+};
+
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler.  The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int idx;
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
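The stub-versus-NULL comparison above can be modelled in isolation with a small standalone program (all names below are made up, not kernel code): comparing a function pointer against a known stub lets the compiler fold the check away when the pointer is a compile-time constant, which a plain !NULL test on a real function's address cannot achieve.

/* Standalone model of the kvm_null_fn trick; every name here is invented. */
#include <stdio.h>

typedef void (*hook_fn_t)(int arg);

static void null_hook(void) { }
#define IS_NULL_HOOK(fn)	((fn) == (hook_fn_t)null_hook)

static void print_hook(int arg)
{
	printf("hook called with %d\n", arg);
}

static void run(hook_fn_t hook, int arg)
{
	/* When 'hook' is a compile-time constant, this test folds away. */
	if (!IS_NULL_HOOK(hook))
		hook(arg);
}

int main(void)
{
	run((hook_fn_t)null_hook, 1);	/* elided: nothing runs */
	run(print_hook, 2);		/* prints */
	return 0;
}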
+
+static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
+                                                 const struct kvm_hva_range *range)
+{
+       bool ret = false, locked = false;
+       struct kvm_gfn_range gfn_range;
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       int i, idx;
+
+       /* A null handler is allowed if and only if on_lock() is provided. */
+       if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
+                        IS_KVM_NULL_FN(range->handler)))
+               return 0;
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       KVM_MMU_LOCK(kvm);
+       /* The on_lock() path does not yet support lock elision. */
+       if (!IS_KVM_NULL_FN(range->on_lock)) {
+               locked = true;
+               KVM_MMU_LOCK(kvm);
 
-       kvm->mmu_notifier_seq++;
+               range->on_lock(kvm, range->start, range->end);
+
+               if (IS_KVM_NULL_FN(range->handler))
+                       goto out_unlock;
+       }
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot(slot, slots) {
+                       unsigned long hva_start, hva_end;
+
+                       hva_start = max(range->start, slot->userspace_addr);
+                       hva_end = min(range->end, slot->userspace_addr +
+                                                 (slot->npages << PAGE_SHIFT));
+                       if (hva_start >= hva_end)
+                               continue;
+
+                       /*
+                        * To optimize for the likely case where the address
+                        * range is covered by zero or one memslots, don't
+                        * bother making these conditional (to avoid writes on
+                        * the second or later invocation of the handler).
+                        */
+                       gfn_range.pte = range->pte;
+                       gfn_range.may_block = range->may_block;
+
+                       /*
+                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+                        */
+                       gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+                       gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+                       gfn_range.slot = slot;
+
+                       if (!locked) {
+                               locked = true;
+                               KVM_MMU_LOCK(kvm);
+                       }
+                       ret |= range->handler(kvm, &gfn_range);
+               }
+       }
 
-       if (kvm_set_spte_hva(kvm, address, pte))
+       if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);
 
-       KVM_MMU_UNLOCK(kvm);
+out_unlock:
+       if (locked)
+               KVM_MMU_UNLOCK(kvm);
+
        srcu_read_unlock(&kvm->srcu, idx);
+
+       /* The notifiers are averse to booleans. :-( */
+       return (int)ret;
 }
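The gfn range computed in the memslot walk above follows directly from hva_to_gfn_memslot()'s arithmetic. A minimal userspace model of that calculation (simplified types, made-up slot values) shows why the exclusive end is taken at hva_end + PAGE_SIZE - 1: it rounds up so that a partially covered final page is still included in the range.

/* Standalone model of the hva -> gfn range math; slot values are made up. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct slot {
	unsigned long userspace_addr;	/* hva the slot starts at */
	unsigned long base_gfn;		/* gfn the slot starts at */
	unsigned long npages;
};

/* Same arithmetic as the kernel's hva_to_gfn_memslot(). */
static unsigned long hva_to_gfn(unsigned long hva, const struct slot *s)
{
	return s->base_gfn + ((hva - s->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct slot s = { .userspace_addr = 0x7f0000000000UL,
			  .base_gfn = 0x100, .npages = 512 };
	unsigned long hva_start = s.userspace_addr + 0x1800;	/* mid-page */
	unsigned long hva_end   = s.userspace_addr + 0x4800;	/* mid-page */

	unsigned long gfn_start = hva_to_gfn(hva_start, &s);
	/* Adding PAGE_SIZE - 1 rounds a partially covered last page in. */
	unsigned long gfn_end   = hva_to_gfn(hva_end + PAGE_SIZE - 1, &s);

	printf("gfns [%#lx, %#lx)\n", gfn_start, gfn_end);	/* [0x101, 0x105) */
	return 0;
}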
 
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-                                       const struct mmu_notifier_range *range)
+static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
+                                               unsigned long start,
+                                               unsigned long end,
+                                               pte_t pte,
+                                               hva_handler_t handler)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int need_tlb_flush = 0, idx;
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = pte,
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = true,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+
+static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
+                                                        unsigned long start,
+                                                        unsigned long end,
+                                                        hva_handler_t handler)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = __pte(0),
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = false,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long address,
+                                       pte_t pte)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+       trace_kvm_set_spte_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
+       /*
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
+        * and so always runs with an elevated notifier count.  This obviates
+        * the need to bump the sequence count.
+        */
+       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+
+       kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+}
+
+static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_notifier_count++;
-       need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
-                                            range->flags);
-       /* we've to flush the tlb before the pages can be freed */
-       if (need_tlb_flush || kvm->tlbs_dirty)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return 0;
+       if (likely(kvm->mmu_notifier_count == 1)) {
+               kvm->mmu_notifier_range_start = start;
+               kvm->mmu_notifier_range_end = end;
+       } else {
+               /*
+                * Fully tracking multiple concurrent ranges has diminishing
+                * returns. Keep things simple and just find the minimal range
+                * which includes the current and new ranges. As there won't be
+                * enough information to subtract a range after its invalidate
+                * completes, any ranges invalidated concurrently will
+                * accumulate and persist until all outstanding invalidates
+                * complete.
+                */
+               kvm->mmu_notifier_range_start =
+                       min(kvm->mmu_notifier_range_start, start);
+               kvm->mmu_notifier_range_end =
+                       max(kvm->mmu_notifier_range_end, end);
+       }
 }
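The "minimal covering range" policy in kvm_inc_notifier_count() can be illustrated with a tiny standalone sketch (hypothetical values): two in-flight invalidations, even disjoint ones, collapse into a single [min(start), max(end)) window that stays in force until the outstanding count drops back to zero.

/* Standalone sketch of the min/max range merging; values are made up. */
#include <stdio.h>

static unsigned long count, range_start, range_end;

static void inc_count(unsigned long start, unsigned long end)
{
	if (++count == 1) {
		range_start = start;
		range_end = end;
	} else {
		/* Track one window covering every in-flight invalidation. */
		if (start < range_start)
			range_start = start;
		if (end > range_end)
			range_end = end;
	}
}

int main(void)
{
	inc_count(0x2000, 0x3000);
	inc_count(0x8000, 0x9000);	/* disjoint, but merged anyway */
	printf("blocked window: [%#lx, %#lx)\n", range_start, range_end);
	return 0;
}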
 
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = kvm_unmap_gfn_range,
+               .on_lock        = kvm_inc_notifier_count,
+               .flush_on_ret   = true,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
 
-       KVM_MMU_LOCK(kvm);
+       trace_kvm_unmap_hva_range(range->start, range->end);
+
+       __kvm_handle_hva_range(kvm, &hva_range);
+
+       return 0;
+}
+
+static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
@@ -517,7 +678,23 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
-       KVM_MMU_UNLOCK(kvm);
+}
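The sequence count that this comment refers to is consumed by page-fault handlers via the usual snapshot-and-retry pattern. The fragment below is a rough sketch of that consumer side, not lifted from any architecture; RET_PF_RETRY is used here only as an illustrative return code.

	unsigned long mmu_seq;
	kvm_pfn_t pfn;

	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();		/* read the seq before translating the gfn */

	pfn = gfn_to_pfn_memslot(slot, gfn);

	KVM_MMU_LOCK(kvm);
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/* An invalidation ran in between; drop the pfn and refault. */
		KVM_MMU_UNLOCK(kvm);
		kvm_release_pfn_clean(pfn);
		return RET_PF_RETRY;	/* hypothetical retry code */
	}
	/* ... safe to install the new mapping here ... */
	KVM_MMU_UNLOCK(kvm);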
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                       const struct mmu_notifier_range *range)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = (void *)kvm_null_fn,
+               .on_lock        = kvm_dec_notifier_count,
+               .flush_on_ret   = false,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
+
+       __kvm_handle_hva_range(kvm, &hva_range);
 
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -527,20 +704,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              unsigned long start,
                                              unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
+       trace_kvm_age_hva(start, end);
 
-       young = kvm_age_hva(kvm, start, end);
-       if (young)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -548,11 +714,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
                                        unsigned long start,
                                        unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
        /*
         * Even though we do not flush TLB, this will still adversely
         * affect performance on pre-Haswell Intel EPT, where there is
@@ -566,27 +729,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
         * cadence. If we find this inaccurate, we might come up with a
         * more sophisticated heuristic later.
         */
-       young = kvm_age_hva(kvm, start, end);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-       young = kvm_test_age_hva(kvm, address);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
+       trace_kvm_test_age_hva(address);
 
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, address, address + 1,
+                                            kvm_test_age_gfn);
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -2023,10 +2176,13 @@ exit:
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
                               bool atomic, bool *async, bool write_fault,
-                              bool *writable)
+                              bool *writable, hva_t *hva)
 {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
+       if (hva)
+               *hva = addr;
+
        if (addr == KVM_HVA_ERR_RO_BAD) {
                if (writable)
                        *writable = false;
@@ -2054,19 +2210,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable)
 {
        return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-                                   write_fault, writable);
+                                   write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
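Only __gfn_to_pfn_memslot() grows the new hva out-parameter; the wrappers above keep passing NULL. A caller that does want the host virtual address, for example to feed a later mmu_notifier_retry_hva() check, might look roughly like the fragment below (illustrative only, not taken from any arch):

	bool writable;
	hva_t hva;
	kvm_pfn_t pfn;

	/* Capture the host virtual address alongside the translation. */
	pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, true,
				   &writable, &hva);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;		/* hypothetical error handling */
	/* ... later: mmu_notifier_retry_hva(kvm, mmu_seq, hva) ... */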
 
@@ -2981,6 +3137,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
        return false;
 }
 
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
        struct kvm *kvm = me->kvm;
@@ -3014,7 +3175,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                            !vcpu_dy_runnable(vcpu))
                                continue;
                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-                               !kvm_arch_vcpu_in_kernel(vcpu))
+                           !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+                           !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
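The new kvm_arch_dy_has_pending_interrupt() hook is what the commit title refers to: a preempted vCPU sitting in user mode is normally skipped as a yield target, but it is now boosted if the architecture reports an interrupt waiting to be delivered to it. A rough sketch of what an arch override might look like follows; the real x86 version dispatches through its vendor ops, so the call below is illustrative rather than exact.

/* Illustrative arch override; the exact hook plumbing differs per arch. */
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	/*
	 * Example: with hardware-accelerated interrupt delivery enabled,
	 * ask the vendor code whether an interrupt is pending for @vcpu.
	 */
	return vcpu->arch.apicv_active &&
	       kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu);
}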
@@ -3161,7 +3323,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_decrement;
 
-       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
@@ -4041,6 +4203,12 @@ static struct file_operations kvm_vm_fops = {
        KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+bool file_is_kvm(struct file *file)
+{
+       return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
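file_is_kvm() gives code outside kvm_main.c a way to check that a struct file really refers to a KVM VM before touching its private_data. A hypothetical external caller validating an fd handed in from userspace might do:

	/* Hypothetical caller: accept only a KVM VM fd from userspace. */
	struct fd f = fdget(vm_fd);

	if (!f.file || !file_is_kvm(f.file)) {
		fdput(f);
		return -EINVAL;
	}
	kvm = f.file->private_data;	/* safe: it really is a VM fd */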
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
        int r;
@@ -4464,24 +4632,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
        return 0;
 }
 
-/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev)
 {
        int i, j;
        struct kvm_io_bus *new_bus, *bus;
 
+       lockdep_assert_held(&kvm->slots_lock);
+
        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
-               return;
+               return 0;
 
-       for (i = 0; i < bus->dev_count; i++)
+       for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev) {
                        break;
                }
+       }
 
        if (i == bus->dev_count)
-               return;
+               return 0;
 
        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
@@ -4490,7 +4660,13 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                                flex_array_size(new_bus, range, new_bus->dev_count - i));
-       } else {
+       }
+
+       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+       synchronize_srcu_expedited(&kvm->srcu);
+
+       /* Destroy the old bus _after_ installing the (null) bus. */
+       if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                for (j = 0; j < bus->dev_count; j++) {
                        if (j == i)
@@ -4499,10 +4675,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                }
        }
 
-       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-       synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
-       return;
+       return new_bus ? 0 : -ENOMEM;
 }
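With the void return replaced by an int, callers holding slots_lock can now learn that the shrink allocation failed and that the whole bus was torn down. A hypothetical caller propagating that result:

	int r;

	/* Hypothetical caller; kvm_io_bus_unregister_dev() wants slots_lock. */
	mutex_lock(&kvm->slots_lock);
	r = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->io_dev);
	mutex_unlock(&kvm->slots_lock);
	if (r)
		return r;	/* -ENOMEM: the bus itself was destroyed */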
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,