Merge branch 'misc.namei' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9869598..439d3b4 100644
@@ -189,16 +189,6 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
        return true;
 }
 
-bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
-{
-       struct page *page = pfn_to_page(pfn);
-
-       if (!PageTransCompoundMap(page))
-               return false;
-
-       return is_transparent_hugepage(compound_head(page));
-}
-
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -318,6 +308,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
         */
        long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
 
+       ++kvm->stat.generic.remote_tlb_flush_requests;
        /*
         * We want to publish modifications to the page tables before reading
         * mode. Pairs with a memory barrier in arch-specific code.
@@ -415,6 +406,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->preempted = false;
        vcpu->ready = false;
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+       vcpu->last_used_slot = 0;
 }
 
 void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -496,17 +488,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       /* The on_lock() path does not yet support lock elision. */
-       if (!IS_KVM_NULL_FN(range->on_lock)) {
-               locked = true;
-               KVM_MMU_LOCK(kvm);
-
-               range->on_lock(kvm, range->start, range->end);
-
-               if (IS_KVM_NULL_FN(range->handler))
-                       goto out_unlock;
-       }
-
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(slot, slots) {
@@ -538,6 +519,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                        if (!locked) {
                                locked = true;
                                KVM_MMU_LOCK(kvm);
+                               if (!IS_KVM_NULL_FN(range->on_lock))
+                                       range->on_lock(kvm, range->start, range->end);
+                               if (IS_KVM_NULL_FN(range->handler))
+                                       break;
                        }
                        ret |= range->handler(kvm, &gfn_range);
                }
@@ -546,7 +531,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
        if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);
 
-out_unlock:
        if (locked)
                KVM_MMU_UNLOCK(kvm);
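
The restructured iteration above defers taking the MMU lock until the first overlapping memslot is found, runs on_lock() exactly once at that point, and releases the lock only if it was ever taken. A minimal userspace sketch of the same lock-on-first-hit pattern (a pthread mutex stands in for KVM_MMU_LOCK; all names are hypothetical):

/*
 * Minimal sketch of the lock-on-first-hit pattern; pthread mutex stands in
 * for KVM_MMU_LOCK and all names are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct range { unsigned long start, end; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static bool handle_overlapping(const struct range *slots, size_t nr,
			       struct range req,
			       void (*on_lock)(struct range req),
			       bool (*handler)(const struct range *slot))
{
	bool locked = false, ret = false;
	size_t i;

	for (i = 0; i < nr; i++) {
		/* Skip slots that do not intersect the requested range. */
		if (slots[i].end <= req.start || slots[i].start >= req.end)
			continue;

		/* Take the lock lazily, on the first slot that matters. */
		if (!locked) {
			locked = true;
			pthread_mutex_lock(&table_lock);
			/* on_lock() runs exactly once, under the lock. */
			if (on_lock)
				on_lock(req);
			if (!handler)
				break;
		}
		ret |= handler(&slots[i]);
	}

	if (locked)
		pthread_mutex_unlock(&table_lock);
	return ret;
}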
 
@@ -604,16 +588,20 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
        trace_kvm_set_spte_hva(address);
 
        /*
-        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
-        * and so always runs with an elevated notifier count.  This obviates
-        * the need to bump the sequence count.
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}().
+        * If mmu_notifier_count is zero, then no in-progress invalidations,
+        * including this one, found a relevant memslot at start(); rechecking
+        * memslots here is unnecessary.  Note, a false positive (count elevated
+        * by a different invalidation) is sub-optimal but functionally ok.
         */
-       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+       WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+       if (!READ_ONCE(kvm->mmu_notifier_count))
+               return;
 
        kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
                                   unsigned long end)
 {
        /*
@@ -658,12 +646,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 
        trace_kvm_unmap_hva_range(range->start, range->end);
 
+       /*
+        * Prevent memslot modification between range_start() and range_end()
+        * so that conditional locking provides the same result in both
+        * functions.  Without that guarantee, the mmu_notifier_count
+        * adjustments will be imbalanced.
+        *
+        * Pairs with the decrement in range_end().
+        */
+       spin_lock(&kvm->mn_invalidate_lock);
+       kvm->mn_active_invalidate_count++;
+       spin_unlock(&kvm->mn_invalidate_lock);
+
        __kvm_handle_hva_range(kvm, &hva_range);
 
        return 0;
 }
 
-static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
                                   unsigned long end)
 {
        /*
@@ -694,9 +694,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
                .flush_on_ret   = false,
                .may_block      = mmu_notifier_range_blockable(range),
        };
+       bool wake;
 
        __kvm_handle_hva_range(kvm, &hva_range);
 
+       /* Pairs with the increment in range_start(). */
+       spin_lock(&kvm->mn_invalidate_lock);
+       wake = (--kvm->mn_active_invalidate_count == 0);
+       spin_unlock(&kvm->mn_invalidate_lock);
+
+       /*
+        * There can only be one waiter, since the wait happens under
+        * slots_lock.
+        */
+       if (wake)
+               rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
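
The range_start()/range_end() pair above keeps a count of in-flight invalidations and wakes a single waiter once the count returns to zero. A hedged pthread-based sketch of that protocol (a condition variable stands in for rcuwait; all names are hypothetical):

/*
 * Sketch of the "count in-flight invalidations, wake one waiter at zero"
 * protocol; a condition variable stands in for rcuwait and all names are
 * hypothetical.
 */
#include <pthread.h>

struct invalidate_state {
	pthread_mutex_t lock;
	pthread_cond_t  idle;	/* signalled when count drops back to zero */
	unsigned long   count;	/* in-flight invalidations */
};

static void range_start(struct invalidate_state *s)
{
	pthread_mutex_lock(&s->lock);
	s->count++;		/* pairs with the decrement in range_end() */
	pthread_mutex_unlock(&s->lock);
}

static void range_end(struct invalidate_state *s)
{
	int wake;

	pthread_mutex_lock(&s->lock);
	wake = (--s->count == 0);
	pthread_mutex_unlock(&s->lock);

	/* At most one waiter exists, so a single signal is enough. */
	if (wake)
		pthread_cond_signal(&s->idle);
}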
 
@@ -892,10 +905,12 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 
 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 {
+       static DEFINE_MUTEX(kvm_debugfs_lock);
+       struct dentry *dent;
        char dir_name[ITOA_MAX_LEN * 2];
        struct kvm_stat_data *stat_data;
        const struct _kvm_stats_desc *pdesc;
-       int i;
+       int i, ret;
        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
                                      kvm_vcpu_stats_header.num_desc;
 
@@ -903,8 +918,20 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
                return 0;
 
        snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
-       kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+       mutex_lock(&kvm_debugfs_lock);
+       dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
+       if (dent) {
+               pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
+               dput(dent);
+               mutex_unlock(&kvm_debugfs_lock);
+               return 0;
+       }
+       dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+       mutex_unlock(&kvm_debugfs_lock);
+       if (IS_ERR(dent))
+               return 0;
 
+       kvm->debugfs_dentry = dent;
        kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
                                         sizeof(*kvm->debugfs_stat_data),
                                         GFP_KERNEL_ACCOUNT);
@@ -940,6 +967,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);
        }
+
+       ret = kvm_arch_create_vm_debugfs(kvm);
+       if (ret) {
+               kvm_destroy_vm_debugfs(kvm);
+               return ret;
+       }
+
        return 0;
 }
 
@@ -960,6 +994,17 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
 {
 }
 
+/*
+ * Called after the per-VM debugfs directory has been created.  At that point
+ * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
+ * be created under it.  Cleanup is handled automatically and recursively by
+ * kvm_destroy_vm_debugfs(), so no per-arch destroy interface is needed.
+ */
+int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+       return 0;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        struct kvm *kvm = kvm_arch_alloc_vm();
@@ -977,6 +1022,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
        mutex_init(&kvm->slots_arch_lock);
+       spin_lock_init(&kvm->mn_invalidate_lock);
+       rcuwait_init(&kvm->mn_memslots_update_rcuwait);
+
        INIT_LIST_HEAD(&kvm->devices);
 
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
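
kvm_arch_create_vm_debugfs() above is a __weak default: architectures that add nothing get the no-op, while a strong definition elsewhere overrides it at link time. A standalone illustration of the weak-symbol mechanism (GCC/Clang attribute; names are hypothetical):

/*
 * Standalone illustration of the __weak override mechanism: the weak
 * definition is the default, and any strong definition of the same symbol
 * linked into the build replaces it.  Names are hypothetical.
 */
#include <stdio.h>

int __attribute__((weak)) arch_create_debugfs(void)
{
	return 0;	/* default: no arch-specific entries to create */
}

int main(void)
{
	/* Prints 0 unless another object file provides a strong definition. */
	printf("arch_create_debugfs() = %d\n", arch_create_debugfs());
	return 0;
}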
@@ -1099,6 +1147,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+       /*
+        * At this point, pending calls to invalidate_range_start()
+        * have completed but no more MMU notifiers will run, so
+        * mn_active_invalidate_count may remain unbalanced.
+        * No threads can be waiting in install_new_memslots as the
+        * last reference on KVM has been dropped, but freeing
+        * memslots would deadlock without this manual intervention.
+        */
+       WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+       kvm->mn_active_invalidate_count = 0;
 #else
        kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1120,6 +1178,16 @@ void kvm_get_kvm(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_get_kvm);
 
+/*
+ * Make sure the VM is not in the middle of destruction: a safe version of
+ * kvm_get_kvm().  Returns true if kvm was referenced, false otherwise.
+ */
+bool kvm_get_kvm_safe(struct kvm *kvm)
+{
+       return refcount_inc_not_zero(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
+
 void kvm_put_kvm(struct kvm *kvm)
 {
        if (refcount_dec_and_test(&kvm->users_count))
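
kvm_get_kvm_safe() builds on refcount_inc_not_zero(): the reference is taken only if the count has not already dropped to zero, so a racing open cannot resurrect a VM that is being torn down. A hedged C11-atomics sketch of that primitive (hypothetical names, without the saturation protection the kernel's refcount_t provides):

/*
 * Sketch of an inc-unless-zero primitive using C11 atomics; hypothetical
 * names, and without the kernel refcount_t saturation handling.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool ref_get_unless_zero(atomic_uint *refs)
{
	unsigned int old = atomic_load(refs);

	do {
		if (old == 0)
			return false;	/* object is already being torn down */
	} while (!atomic_compare_exchange_weak(refs, &old, old + 1));

	return true;
}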
@@ -1180,8 +1248,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots,
 
        slots->used_slots--;
 
-       if (atomic_read(&slots->lru_slot) >= slots->used_slots)
-               atomic_set(&slots->lru_slot, 0);
+       if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
+               atomic_set(&slots->last_used_slot, 0);
 
        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
                mslots[i] = mslots[i + 1];
@@ -1350,7 +1418,22 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+       /*
+        * Do not store the new memslots while there are invalidations in
+        * progress, otherwise the locking in invalidate_range_start and
+        * invalidate_range_end will be unbalanced.
+        */
+       spin_lock(&kvm->mn_invalidate_lock);
+       prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
+       while (kvm->mn_active_invalidate_count) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_unlock(&kvm->mn_invalidate_lock);
+               schedule();
+               spin_lock(&kvm->mn_invalidate_lock);
+       }
+       finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
        rcu_assign_pointer(kvm->memslots[as_id], slots);
+       spin_unlock(&kvm->mn_invalidate_lock);
 
        /*
         * Acquired in kvm_set_memslot. Must be released before synchronize
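
The publish path above blocks until no invalidation is in flight before storing the new memslots. Continuing the hypothetical invalidate_state sketch given earlier, the waiter side could look like this:

/*
 * Waiter side of the hypothetical invalidate_state sketch above: block until
 * no invalidation is in flight, then publish while still holding the lock so
 * a new range_start() cannot slip in between the check and the publish.
 */
static void publish_when_idle(struct invalidate_state *s,
			      void (*publish)(void *ctx), void *ctx)
{
	pthread_mutex_lock(&s->lock);
	while (s->count)
		pthread_cond_wait(&s->idle, &s->lock);
	publish(ctx);
	pthread_mutex_unlock(&s->lock);
}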
@@ -1966,7 +2049,26 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
-       return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
+       struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+       struct kvm_memory_slot *slot;
+       int slot_index;
+
+       slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
+       if (slot)
+               return slot;
+
+       /*
+        * Fall back to searching all memslots. We purposely use
+        * search_memslots() instead of __gfn_to_memslot() to avoid
+        * thrashing the VM-wide last_used_slot in kvm_memslots.
+        */
+       slot = search_memslots(slots, gfn, &slot_index);
+       if (slot) {
+               vcpu->last_used_slot = slot_index;
+               return slot;
+       }
+
+       return NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
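
The new kvm_vcpu_gfn_to_memslot() first probes the per-vCPU cached index and only falls back to a full search, updating the private cache rather than the VM-wide one. A small sketch of that lookup strategy over a plain array (types and names are hypothetical; the real memslot search is more involved):

/*
 * Sketch of "private cached index first, full search as fallback".
 * Types and names are hypothetical.
 */
#include <stddef.h>

struct slot { unsigned long base, npages; };

static int slot_contains(const struct slot *s, unsigned long gfn)
{
	return gfn >= s->base && gfn < s->base + s->npages;
}

static const struct slot *find_slot(const struct slot *slots, size_t nr,
				    unsigned long gfn, size_t *cached_index)
{
	size_t i;

	/* Fast path: most lookups hit the same slot as the previous one. */
	if (*cached_index < nr && slot_contains(&slots[*cached_index], gfn))
		return &slots[*cached_index];

	/* Slow path: scan everything, then remember the hit privately. */
	for (i = 0; i < nr; i++) {
		if (slot_contains(&slots[i], gfn)) {
			*cached_index = i;
			return &slots[i];
		}
	}
	return NULL;
}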
 
@@ -2225,7 +2327,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
         * Get a reference here because callers of *hva_to_pfn* and
         * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
         * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
-        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
         * simply do nothing for reserved pfns.
         *
         * Whoever called remap_pfn_range is also going to call e.g.
@@ -2622,13 +2724,6 @@ void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
-void kvm_get_pfn(kvm_pfn_t pfn)
-{
-       if (!kvm_is_reserved_pfn(pfn))
-               get_page(pfn_to_page(pfn));
-}
-EXPORT_SYMBOL_GPL(kvm_get_pfn);
-
 static int next_segment(unsigned long len, int offset)
 {
        if (len > PAGE_SIZE - offset)
@@ -3108,12 +3203,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                                ++vcpu->stat.generic.halt_successful_poll;
                                if (!vcpu_valid_wakeup(vcpu))
                                        ++vcpu->stat.generic.halt_poll_invalid;
+
+                               KVM_STATS_LOG_HIST_UPDATE(
+                                     vcpu->stat.generic.halt_poll_success_hist,
+                                     ktime_to_ns(ktime_get()) -
+                                     ktime_to_ns(start));
                                goto out;
                        }
+                       cpu_relax();
                        poll_end = cur = ktime_get();
                } while (kvm_vcpu_can_poll(cur, stop));
+
+               KVM_STATS_LOG_HIST_UPDATE(
+                               vcpu->stat.generic.halt_poll_fail_hist,
+                               ktime_to_ns(ktime_get()) - ktime_to_ns(start));
        }
 
+
        prepare_to_rcuwait(&vcpu->wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
@@ -3126,6 +3232,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        }
        finish_rcuwait(&vcpu->wait);
        cur = ktime_get();
+       if (waited) {
+               vcpu->stat.generic.halt_wait_ns +=
+                       ktime_to_ns(cur) - ktime_to_ns(poll_end);
+               KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
+                               ktime_to_ns(cur) - ktime_to_ns(poll_end));
+       }
 out:
        kvm_arch_vcpu_unblocking(vcpu);
        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
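
The KVM_STATS_LOG_HIST_UPDATE() calls above bucket halt-poll and halt-wait latencies into a logarithmic histogram. A hedged sketch of one way to do such a log2 bucketed update (hypothetical helper, not the kernel macro):

/*
 * Logarithmic histogram update: the bucket index is the number of
 * significant bits in the sample, clamped to the last bucket.
 * Hypothetical helper, not the kernel's KVM_STATS_LOG_HIST_UPDATE().
 */
#include <stdint.h>
#include <stddef.h>

static void log_hist_update(uint64_t *buckets, size_t nr_buckets, uint64_t val)
{
	size_t index = 0;

	while (val) {			/* index = number of significant bits */
		index++;
		val >>= 1;
	}
	if (index >= nr_buckets)
		index = nr_buckets - 1;	/* clamp outliers into the last bucket */
	buckets[index]++;
}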
@@ -3597,7 +3709,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
        struct kvm_fpu *fpu = NULL;
        struct kvm_sregs *kvm_sregs = NULL;
 
-       if (vcpu->kvm->mm != current->mm)
+       if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
                return -EIO;
 
        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
@@ -3807,7 +3919,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
        void __user *argp = compat_ptr(arg);
        int r;
 
-       if (vcpu->kvm->mm != current->mm)
+       if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
                return -EIO;
 
        switch (ioctl) {
@@ -3873,7 +3985,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
        struct kvm_device *dev = filp->private_data;
 
-       if (dev->kvm->mm != current->mm)
+       if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
                return -EIO;
 
        switch (ioctl) {
@@ -4195,7 +4307,7 @@ static long kvm_vm_ioctl(struct file *filp,
        void __user *argp = (void __user *)arg;
        int r;
 
-       if (kvm->mm != current->mm)
+       if (kvm->mm != current->mm || kvm->vm_bugged)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
@@ -4390,15 +4502,43 @@ struct compat_kvm_dirty_log {
        };
 };
 
+struct compat_kvm_clear_dirty_log {
+       __u32 slot;
+       __u32 num_pages;
+       __u64 first_page;
+       union {
+               compat_uptr_t dirty_bitmap; /* one bit per page */
+               __u64 padding2;
+       };
+};
+
 static long kvm_vm_compat_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {
        struct kvm *kvm = filp->private_data;
        int r;
 
-       if (kvm->mm != current->mm)
+       if (kvm->mm != current->mm || kvm->vm_bugged)
                return -EIO;
        switch (ioctl) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+       case KVM_CLEAR_DIRTY_LOG: {
+               struct compat_kvm_clear_dirty_log compat_log;
+               struct kvm_clear_dirty_log log;
+
+               if (copy_from_user(&compat_log, (void __user *)arg,
+                                  sizeof(compat_log)))
+                       return -EFAULT;
+               log.slot         = compat_log.slot;
+               log.num_pages    = compat_log.num_pages;
+               log.first_page   = compat_log.first_page;
+               log.padding2     = compat_log.padding2;
+               log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+               r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+               break;
+       }
+#endif
        case KVM_GET_DIRTY_LOG: {
                struct compat_kvm_dirty_log compat_log;
                struct kvm_dirty_log log;
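
The compat handler above copies the 32-bit layout field by field and widens the bitmap pointer, because a 32-bit userspace passes a 4-byte pointer value where the native structure holds an 8-byte one. A userspace stand-in for that widening step (types and names are hypothetical, not the kernel's compat machinery):

/*
 * Field-by-field widening of a 32-bit layout into a native one; the 32-bit
 * pointer value is zero-extended, much like compat_ptr().  Hypothetical
 * stand-in types only.
 */
#include <stdint.h>

struct compat_log {
	uint32_t slot;
	uint32_t num_pages;
	uint64_t first_page;
	uint32_t dirty_bitmap;		/* 32-bit user pointer value */
};

struct native_log {
	uint32_t slot;
	uint32_t num_pages;
	uint64_t first_page;
	void *dirty_bitmap;
};

static void widen_log(struct native_log *out, const struct compat_log *in)
{
	out->slot	  = in->slot;
	out->num_pages	  = in->num_pages;
	out->first_page	  = in->first_page;
	out->dirty_bitmap = (void *)(uintptr_t)in->dirty_bitmap;
}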
@@ -4940,12 +5080,12 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
                                          inode->i_private;
 
-       /* The debugfs files are a reference to the kvm struct which
-        * is still valid when kvm_destroy_vm is called.
-        * To avoid the race between open and the removal of the debugfs
-        * directory we test against the users count.
+       /*
+        * The debugfs files are a reference to the kvm struct which
+        * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
+        * avoids the race between open and the removal of the debugfs directory.
         */
-       if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
+       if (!kvm_get_kvm_safe(stat_data->kvm))
                return -ENOENT;
 
        if (simple_attr_open(inode, file, get,
@@ -5172,7 +5312,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);
 
-       if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
+       if (kvm->debugfs_dentry) {
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
 
                if (p) {