[linux-2.6-microblaze.git] virt/kvm/kvm_main.c @ commit 69c318fdff616d43799a30dc7f7f999144767df2
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * Copyright (C) 2006 Qumranet, Inc.
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  */
15
16 #include <kvm/iodev.h>
17
18 #include <linux/kvm_host.h>
19 #include <linux/kvm.h>
20 #include <linux/module.h>
21 #include <linux/errno.h>
22 #include <linux/percpu.h>
23 #include <linux/mm.h>
24 #include <linux/miscdevice.h>
25 #include <linux/vmalloc.h>
26 #include <linux/reboot.h>
27 #include <linux/debugfs.h>
28 #include <linux/highmem.h>
29 #include <linux/file.h>
30 #include <linux/syscore_ops.h>
31 #include <linux/cpu.h>
32 #include <linux/sched/signal.h>
33 #include <linux/sched/mm.h>
34 #include <linux/sched/stat.h>
35 #include <linux/cpumask.h>
36 #include <linux/smp.h>
37 #include <linux/anon_inodes.h>
38 #include <linux/profile.h>
39 #include <linux/kvm_para.h>
40 #include <linux/pagemap.h>
41 #include <linux/mman.h>
42 #include <linux/swap.h>
43 #include <linux/bitops.h>
44 #include <linux/spinlock.h>
45 #include <linux/compat.h>
46 #include <linux/srcu.h>
47 #include <linux/hugetlb.h>
48 #include <linux/slab.h>
49 #include <linux/sort.h>
50 #include <linux/bsearch.h>
51 #include <linux/io.h>
52 #include <linux/lockdep.h>
53 #include <linux/kthread.h>
54 #include <linux/suspend.h>
55
56 #include <asm/processor.h>
57 #include <asm/ioctl.h>
58 #include <linux/uaccess.h>
59
60 #include "coalesced_mmio.h"
61 #include "async_pf.h"
62 #include "kvm_mm.h"
63 #include "vfio.h"
64
65 #define CREATE_TRACE_POINTS
66 #include <trace/events/kvm.h>
67
68 #include <linux/kvm_dirty_ring.h>
69
70 /* Worst case buffer size needed for holding an integer. */
71 #define ITOA_MAX_LEN 12
72
73 MODULE_AUTHOR("Qumranet");
74 MODULE_LICENSE("GPL");
75
76 /* Architectures should define their poll value according to the halt latency */
77 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
78 module_param(halt_poll_ns, uint, 0644);
79 EXPORT_SYMBOL_GPL(halt_poll_ns);
80
81 /* Default doubles per-vcpu halt_poll_ns. */
82 unsigned int halt_poll_ns_grow = 2;
83 module_param(halt_poll_ns_grow, uint, 0644);
84 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
85
86 /* The start value to grow halt_poll_ns from */
87 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
88 module_param(halt_poll_ns_grow_start, uint, 0644);
89 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
90
91 /* Default resets per-vcpu halt_poll_ns. */
92 unsigned int halt_poll_ns_shrink;
93 module_param(halt_poll_ns_shrink, uint, 0644);
94 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
95
96 /*
97  * Ordering of locks:
98  *
99  *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
100  */
101
102 DEFINE_MUTEX(kvm_lock);
103 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
104 LIST_HEAD(vm_list);
105
106 static cpumask_var_t cpus_hardware_enabled;
107 static int kvm_usage_count;
108 static atomic_t hardware_enable_failed;
109
110 static struct kmem_cache *kvm_vcpu_cache;
111
112 static __read_mostly struct preempt_ops kvm_preempt_ops;
113 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
114
115 struct dentry *kvm_debugfs_dir;
116 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
117
118 static const struct file_operations stat_fops_per_vm;
119
120 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
121                            unsigned long arg);
122 #ifdef CONFIG_KVM_COMPAT
123 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
124                                   unsigned long arg);
125 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
126 #else
127 /*
128  * For architectures that don't implement a compat infrastructure,
129  * adopt a double line of defense:
130  * - Prevent a compat task from opening /dev/kvm
131  * - If the open has been done by a 64-bit task and the KVM fd is
132  *   passed to a compat task, let the ioctls fail.
133  */
134 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
135                                 unsigned long arg) { return -EINVAL; }
136
137 static int kvm_no_compat_open(struct inode *inode, struct file *file)
138 {
139         return is_compat_task() ? -ENODEV : 0;
140 }
141 #define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
142                         .open           = kvm_no_compat_open
143 #endif
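
/*
 * Illustrative sketch (editorial addition, not part of this file): the
 * KVM_COMPAT() macro above is meant to be dropped into a file_operations
 * initializer so that the compat handling is selected at compile time.
 * The initializer below is only an assumed example of such a consumer.
 */
#if 0
static const struct file_operations kvm_example_fops = {
	.unlocked_ioctl = kvm_vcpu_ioctl,       /* native ioctl path */
	KVM_COMPAT(kvm_vcpu_compat_ioctl),      /* compat path, or the -EINVAL stub */
};
#endif
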
144 static int hardware_enable_all(void);
145 static void hardware_disable_all(void);
146
147 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
148
149 __visible bool kvm_rebooting;
150 EXPORT_SYMBOL_GPL(kvm_rebooting);
151
152 #define KVM_EVENT_CREATE_VM 0
153 #define KVM_EVENT_DESTROY_VM 1
154 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
155 static unsigned long long kvm_createvm_count;
156 static unsigned long long kvm_active_vms;
157
158 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
159
160 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
161                                                    unsigned long start, unsigned long end)
162 {
163 }
164
165 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
166 {
167         /*
168          * The metadata used by is_zone_device_page() to determine whether or
169          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
170          * the device has been pinned, e.g. by get_user_pages().  WARN if the
171          * page_count() is zero to help detect bad usage of this helper.
172          */
173         if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
174                 return false;
175
176         return is_zone_device_page(pfn_to_page(pfn));
177 }
178
179 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
180 {
181         /*
182          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
183          * perspective they are "normal" pages, albeit with slightly different
184          * usage rules.
185          */
186         if (pfn_valid(pfn))
187                 return PageReserved(pfn_to_page(pfn)) &&
188                        !is_zero_pfn(pfn) &&
189                        !kvm_is_zone_device_pfn(pfn);
190
191         return true;
192 }
193
194 /*
195  * Switches to specified vcpu, until a matching vcpu_put()
196  */
197 void vcpu_load(struct kvm_vcpu *vcpu)
198 {
199         int cpu = get_cpu();
200
201         __this_cpu_write(kvm_running_vcpu, vcpu);
202         preempt_notifier_register(&vcpu->preempt_notifier);
203         kvm_arch_vcpu_load(vcpu, cpu);
204         put_cpu();
205 }
206 EXPORT_SYMBOL_GPL(vcpu_load);
207
208 void vcpu_put(struct kvm_vcpu *vcpu)
209 {
210         preempt_disable();
211         kvm_arch_vcpu_put(vcpu);
212         preempt_notifier_unregister(&vcpu->preempt_notifier);
213         __this_cpu_write(kvm_running_vcpu, NULL);
214         preempt_enable();
215 }
216 EXPORT_SYMBOL_GPL(vcpu_put);
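
/*
 * Illustrative sketch (editorial addition): the expected pairing of
 * vcpu_load()/vcpu_put() around per-vCPU work, taken under the vCPU mutex as
 * the vCPU ioctl path does.  The "do work" placeholder is hypothetical.
 */
#if 0
static int kvm_example_vcpu_work(struct kvm_vcpu *vcpu)
{
	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	vcpu_load(vcpu);        /* registers preempt notifier, calls arch load */
	/* ... operate on vCPU state that must stay on this pCPU ... */
	vcpu_put(vcpu);         /* arch put, unregister preempt notifier */
	mutex_unlock(&vcpu->mutex);
	return 0;
}
#endif
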
217
218 /* TODO: merge with kvm_arch_vcpu_should_kick */
219 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
220 {
221         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
222
223         /*
224          * We need to wait for the VCPU to reenable interrupts and get out of
225          * READING_SHADOW_PAGE_TABLES mode.
226          */
227         if (req & KVM_REQUEST_WAIT)
228                 return mode != OUTSIDE_GUEST_MODE;
229
230         /*
231          * Need to kick a running VCPU, but otherwise there is nothing to do.
232          */
233         return mode == IN_GUEST_MODE;
234 }
235
236 static void ack_flush(void *_completed)
237 {
238 }
239
240 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
241 {
242         if (cpumask_empty(cpus))
243                 return false;
244
245         smp_call_function_many(cpus, ack_flush, NULL, wait);
246         return true;
247 }
248
249 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
250                                   struct cpumask *tmp, int current_cpu)
251 {
252         int cpu;
253
254         kvm_make_request(req, vcpu);
255
256         if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
257                 return;
258
259         /*
260          * Note, the vCPU could get migrated to a different pCPU at any point
261          * after kvm_request_needs_ipi(), which could result in sending an IPI
262          * to the previous pCPU.  But, that's OK because the purpose of the IPI
263          * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
264          * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
265          * after this point is also OK, as the requirement is only that KVM wait
266          * for vCPUs that were reading SPTEs _before_ any changes were
267          * finalized. See kvm_vcpu_kick() for more details on handling requests.
268          */
269         if (kvm_request_needs_ipi(vcpu, req)) {
270                 cpu = READ_ONCE(vcpu->cpu);
271                 if (cpu != -1 && cpu != current_cpu)
272                         __cpumask_set_cpu(cpu, tmp);
273         }
274 }
275
276 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
277                                  unsigned long *vcpu_bitmap)
278 {
279         struct kvm_vcpu *vcpu;
280         struct cpumask *cpus;
281         int i, me;
282         bool called;
283
284         me = get_cpu();
285
286         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
287         cpumask_clear(cpus);
288
289         for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
290                 vcpu = kvm_get_vcpu(kvm, i);
291                 if (!vcpu)
292                         continue;
293                 kvm_make_vcpu_request(vcpu, req, cpus, me);
294         }
295
296         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
297         put_cpu();
298
299         return called;
300 }
301
302 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
303                                       struct kvm_vcpu *except)
304 {
305         struct kvm_vcpu *vcpu;
306         struct cpumask *cpus;
307         unsigned long i;
308         bool called;
309         int me;
310
311         me = get_cpu();
312
313         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
314         cpumask_clear(cpus);
315
316         kvm_for_each_vcpu(i, vcpu, kvm) {
317                 if (vcpu == except)
318                         continue;
319                 kvm_make_vcpu_request(vcpu, req, cpus, me);
320         }
321
322         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
323         put_cpu();
324
325         return called;
326 }
327
328 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
329 {
330         return kvm_make_all_cpus_request_except(kvm, req, NULL);
331 }
332 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
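
/*
 * Illustrative sketch (editorial addition): how a request posted with
 * kvm_make_all_cpus_request() is consumed.  Each vCPU clears and acts on the
 * request from its run loop via kvm_check_request(); the flush helper named
 * below is only a hypothetical stand-in for the arch-specific handler.
 */
#if 0
	/* producer: set the request bit and kick every vCPU out of guest mode */
	kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);

	/* consumer: in the arch vcpu-run path, before re-entering the guest */
	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
		arch_flush_vcpu_tlb(vcpu);      /* hypothetical arch handler */
#endif
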
333
334 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
335 void kvm_flush_remote_tlbs(struct kvm *kvm)
336 {
337         ++kvm->stat.generic.remote_tlb_flush_requests;
338
339         /*
340          * We want to publish modifications to the page tables before reading
341          * mode. Pairs with a memory barrier in arch-specific code.
342          * vcpu->mode. Pairs with a memory barrier in arch-specific code.
343          * and smp_mb in walk_shadow_page_lockless_begin/end.
344          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
345          *
346          * There is already an smp_mb__after_atomic() before
347          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
348          * barrier here.
349          */
350         if (!kvm_arch_flush_remote_tlb(kvm)
351             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
352                 ++kvm->stat.generic.remote_tlb_flush;
353 }
354 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
355 #endif
356
357 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
358 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
359                                                gfp_t gfp_flags)
360 {
361         gfp_flags |= mc->gfp_zero;
362
363         if (mc->kmem_cache)
364                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
365         else
366                 return (void *)__get_free_page(gfp_flags);
367 }
368
369 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
370 {
371         void *obj;
372
373         if (mc->nobjs >= min)
374                 return 0;
375         while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
376                 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
377                 if (!obj)
378                         return mc->nobjs >= min ? 0 : -ENOMEM;
379                 mc->objects[mc->nobjs++] = obj;
380         }
381         return 0;
382 }
383
384 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
385 {
386         return mc->nobjs;
387 }
388
389 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
390 {
391         while (mc->nobjs) {
392                 if (mc->kmem_cache)
393                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
394                 else
395                         free_page((unsigned long)mc->objects[--mc->nobjs]);
396         }
397 }
398
399 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
400 {
401         void *p;
402
403         if (WARN_ON(!mc->nobjs))
404                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
405         else
406                 p = mc->objects[--mc->nobjs];
407         BUG_ON(!p);
408         return p;
409 }
410 #endif
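
/*
 * Illustrative sketch (editorial addition): the intended split for the MMU
 * memory caches above.  Topping up may sleep (GFP_KERNEL_ACCOUNT), so it is
 * done before taking mmu_lock; kvm_mmu_memory_cache_alloc() then pops
 * pre-allocated objects while the lock is held and sleeping is forbidden.
 * The cache field and "min" value are assumptions for the example.
 */
#if 0
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.example_cache, 4);
	if (r)
		return r;

	KVM_MMU_LOCK(kvm);
	pte_obj = kvm_mmu_memory_cache_alloc(&vcpu->arch.example_cache);
	/* ... install pte_obj into the shadow page tables ... */
	KVM_MMU_UNLOCK(kvm);
#endif
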
411
412 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
413 {
414         mutex_init(&vcpu->mutex);
415         vcpu->cpu = -1;
416         vcpu->kvm = kvm;
417         vcpu->vcpu_id = id;
418         vcpu->pid = NULL;
419 #ifndef __KVM_HAVE_ARCH_WQP
420         rcuwait_init(&vcpu->wait);
421 #endif
422         kvm_async_pf_vcpu_init(vcpu);
423
424         kvm_vcpu_set_in_spin_loop(vcpu, false);
425         kvm_vcpu_set_dy_eligible(vcpu, false);
426         vcpu->preempted = false;
427         vcpu->ready = false;
428         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
429         vcpu->last_used_slot = NULL;
430 }
431
432 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
433 {
434         kvm_dirty_ring_free(&vcpu->dirty_ring);
435         kvm_arch_vcpu_destroy(vcpu);
436
437         /*
438          * No need for rcu_read_lock as VCPU_RUN is the only place that changes
439          * the vcpu->pid pointer, and at destruction time all file descriptors
440          * are already gone.
441          */
442         put_pid(rcu_dereference_protected(vcpu->pid, 1));
443
444         free_page((unsigned long)vcpu->run);
445         kmem_cache_free(kvm_vcpu_cache, vcpu);
446 }
447
448 void kvm_destroy_vcpus(struct kvm *kvm)
449 {
450         unsigned long i;
451         struct kvm_vcpu *vcpu;
452
453         kvm_for_each_vcpu(i, vcpu, kvm) {
454                 kvm_vcpu_destroy(vcpu);
455                 xa_erase(&kvm->vcpu_array, i);
456         }
457
458         atomic_set(&kvm->online_vcpus, 0);
459 }
460 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
461
462 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
463 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
464 {
465         return container_of(mn, struct kvm, mmu_notifier);
466 }
467
468 static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
469                                               struct mm_struct *mm,
470                                               unsigned long start, unsigned long end)
471 {
472         struct kvm *kvm = mmu_notifier_to_kvm(mn);
473         int idx;
474
475         idx = srcu_read_lock(&kvm->srcu);
476         kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
477         srcu_read_unlock(&kvm->srcu, idx);
478 }
479
480 typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
481
482 typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
483                              unsigned long end);
484
485 struct kvm_hva_range {
486         unsigned long start;
487         unsigned long end;
488         pte_t pte;
489         hva_handler_t handler;
490         on_lock_fn_t on_lock;
491         bool flush_on_ret;
492         bool may_block;
493 };
494
495 /*
496  * Use a dedicated stub instead of NULL to indicate that there is no callback
497  * function/handler.  The compiler technically can't guarantee that a real
498  * function will have a non-zero address, and so it will generate code to
499  * check for !NULL, whereas comparing against a stub will be elided at compile
500  * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
501  */
502 static void kvm_null_fn(void)
503 {
504
505 }
506 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
507
508 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
509 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)          \
510         for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
511              node;                                                           \
512              node = interval_tree_iter_next(node, start, last))      \
513
514 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
515                                                   const struct kvm_hva_range *range)
516 {
517         bool ret = false, locked = false;
518         struct kvm_gfn_range gfn_range;
519         struct kvm_memory_slot *slot;
520         struct kvm_memslots *slots;
521         int i, idx;
522
523         if (WARN_ON_ONCE(range->end <= range->start))
524                 return 0;
525
526         /* A null handler is allowed if and only if on_lock() is provided. */
527         if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
528                          IS_KVM_NULL_FN(range->handler)))
529                 return 0;
530
531         idx = srcu_read_lock(&kvm->srcu);
532
533         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
534                 struct interval_tree_node *node;
535
536                 slots = __kvm_memslots(kvm, i);
537                 kvm_for_each_memslot_in_hva_range(node, slots,
538                                                   range->start, range->end - 1) {
539                         unsigned long hva_start, hva_end;
540
541                         slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
542                         hva_start = max(range->start, slot->userspace_addr);
543                         hva_end = min(range->end, slot->userspace_addr +
544                                                   (slot->npages << PAGE_SHIFT));
545
546                         /*
547                          * To optimize for the likely case where the address
548                          * range is covered by zero or one memslots, don't
549                          * bother making these conditional (to avoid writes on
550                          * the second or later invocation of the handler).
551                          */
552                         gfn_range.pte = range->pte;
553                         gfn_range.may_block = range->may_block;
554
555                         /*
556                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
557                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
558                          */
559                         gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
560                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
561                         gfn_range.slot = slot;
562
563                         if (!locked) {
564                                 locked = true;
565                                 KVM_MMU_LOCK(kvm);
566                                 if (!IS_KVM_NULL_FN(range->on_lock))
567                                         range->on_lock(kvm, range->start, range->end);
568                                 if (IS_KVM_NULL_FN(range->handler))
569                                         break;
570                         }
571                         ret |= range->handler(kvm, &gfn_range);
572                 }
573         }
574
575         if (range->flush_on_ret && ret)
576                 kvm_flush_remote_tlbs(kvm);
577
578         if (locked)
579                 KVM_MMU_UNLOCK(kvm);
580
581         srcu_read_unlock(&kvm->srcu, idx);
582
583         /* The notifiers are averse to booleans. :-( */
584         return (int)ret;
585 }
586
587 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
588                                                 unsigned long start,
589                                                 unsigned long end,
590                                                 pte_t pte,
591                                                 hva_handler_t handler)
592 {
593         struct kvm *kvm = mmu_notifier_to_kvm(mn);
594         const struct kvm_hva_range range = {
595                 .start          = start,
596                 .end            = end,
597                 .pte            = pte,
598                 .handler        = handler,
599                 .on_lock        = (void *)kvm_null_fn,
600                 .flush_on_ret   = true,
601                 .may_block      = false,
602         };
603
604         return __kvm_handle_hva_range(kvm, &range);
605 }
606
607 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
608                                                          unsigned long start,
609                                                          unsigned long end,
610                                                          hva_handler_t handler)
611 {
612         struct kvm *kvm = mmu_notifier_to_kvm(mn);
613         const struct kvm_hva_range range = {
614                 .start          = start,
615                 .end            = end,
616                 .pte            = __pte(0),
617                 .handler        = handler,
618                 .on_lock        = (void *)kvm_null_fn,
619                 .flush_on_ret   = false,
620                 .may_block      = false,
621         };
622
623         return __kvm_handle_hva_range(kvm, &range);
624 }
625 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
626                                         struct mm_struct *mm,
627                                         unsigned long address,
628                                         pte_t pte)
629 {
630         struct kvm *kvm = mmu_notifier_to_kvm(mn);
631
632         trace_kvm_set_spte_hva(address);
633
634         /*
635          * .change_pte() must be surrounded by .invalidate_range_{start,end}().
636          * If mmu_notifier_count is zero, then no in-progress invalidations,
637          * including this one, found a relevant memslot at start(); rechecking
638          * memslots here is unnecessary.  Note, a false positive (count elevated
639          * by a different invalidation) is sub-optimal but functionally ok.
640          */
641         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
642         if (!READ_ONCE(kvm->mmu_notifier_count))
643                 return;
644
645         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
646 }
647
648 void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
649                                    unsigned long end)
650 {
651         /*
652          * The count increase must become visible at unlock time as no
653          * spte can be established without taking the mmu_lock, and the
654          * count is also read inside the mmu_lock critical section.
655          */
656         kvm->mmu_notifier_count++;
657         if (likely(kvm->mmu_notifier_count == 1)) {
658                 kvm->mmu_notifier_range_start = start;
659                 kvm->mmu_notifier_range_end = end;
660         } else {
661                 /*
662                  * Fully tracking multiple concurrent ranges has diminishing
663                  * returns. Keep things simple and just find the minimal range
664                  * which includes the current and new ranges. As there won't be
665                  * enough information to subtract a range after its invalidate
666                  * completes, any ranges invalidated concurrently will
667                  * accumulate and persist until all outstanding invalidates
668                  * complete.
669                  */
670                 kvm->mmu_notifier_range_start =
671                         min(kvm->mmu_notifier_range_start, start);
672                 kvm->mmu_notifier_range_end =
673                         max(kvm->mmu_notifier_range_end, end);
674         }
675 }
676
677 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
678                                         const struct mmu_notifier_range *range)
679 {
680         struct kvm *kvm = mmu_notifier_to_kvm(mn);
681         const struct kvm_hva_range hva_range = {
682                 .start          = range->start,
683                 .end            = range->end,
684                 .pte            = __pte(0),
685                 .handler        = kvm_unmap_gfn_range,
686                 .on_lock        = kvm_inc_notifier_count,
687                 .flush_on_ret   = true,
688                 .may_block      = mmu_notifier_range_blockable(range),
689         };
690
691         trace_kvm_unmap_hva_range(range->start, range->end);
692
693         /*
694          * Prevent memslot modification between range_start() and range_end()
695          * so that conditionally locking provides the same result in both
696          * functions.  Without that guarantee, the mmu_notifier_count
697          * adjustments will be imbalanced.
698          *
699          * Pairs with the decrement in range_end().
700          */
701         spin_lock(&kvm->mn_invalidate_lock);
702         kvm->mn_active_invalidate_count++;
703         spin_unlock(&kvm->mn_invalidate_lock);
704
705         gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
706                                           hva_range.may_block);
707
708         __kvm_handle_hva_range(kvm, &hva_range);
709
710         return 0;
711 }
712
713 void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
714                                    unsigned long end)
715 {
716         /*
717          * This sequence increase notifies the KVM page fault handler that
718          * the page about to be mapped in the SPTE could have
719          * been freed.
720          */
721         kvm->mmu_notifier_seq++;
722         smp_wmb();
723         /*
724          * The above sequence increase must be visible before the
725          * below count decrease, which is ensured by the smp_wmb above
726          * in conjunction with the smp_rmb in mmu_notifier_retry().
727          */
728         kvm->mmu_notifier_count--;
729 }
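
/*
 * Illustrative sketch (editorial addition): the page-fault side of the
 * mmu_notifier_seq/mmu_notifier_count protocol maintained above.  The fault
 * handler snapshots the sequence, resolves the pfn outside mmu_lock, and
 * retries if an invalidation ran in the meantime (see mmu_notifier_retry()).
 * Variable names and control flow are assumptions for the example.
 */
#if 0
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(kvm, gfn);             /* may sleep, may race */

	KVM_MMU_LOCK(kvm);
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		KVM_MMU_UNLOCK(kvm);
		goto retry;                     /* invalidation raced, refault */
	}
	/* ... safe to install the mapping for pfn ... */
	KVM_MMU_UNLOCK(kvm);
#endif
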
730
731 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
732                                         const struct mmu_notifier_range *range)
733 {
734         struct kvm *kvm = mmu_notifier_to_kvm(mn);
735         const struct kvm_hva_range hva_range = {
736                 .start          = range->start,
737                 .end            = range->end,
738                 .pte            = __pte(0),
739                 .handler        = (void *)kvm_null_fn,
740                 .on_lock        = kvm_dec_notifier_count,
741                 .flush_on_ret   = false,
742                 .may_block      = mmu_notifier_range_blockable(range),
743         };
744         bool wake;
745
746         __kvm_handle_hva_range(kvm, &hva_range);
747
748         /* Pairs with the increment in range_start(). */
749         spin_lock(&kvm->mn_invalidate_lock);
750         wake = (--kvm->mn_active_invalidate_count == 0);
751         spin_unlock(&kvm->mn_invalidate_lock);
752
753         /*
754          * There can only be one waiter, since the wait happens under
755          * slots_lock.
756          */
757         if (wake)
758                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
759
760         BUG_ON(kvm->mmu_notifier_count < 0);
761 }
762
763 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
764                                               struct mm_struct *mm,
765                                               unsigned long start,
766                                               unsigned long end)
767 {
768         trace_kvm_age_hva(start, end);
769
770         return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
771 }
772
773 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
774                                         struct mm_struct *mm,
775                                         unsigned long start,
776                                         unsigned long end)
777 {
778         trace_kvm_age_hva(start, end);
779
780         /*
781          * Even though we do not flush TLB, this will still adversely
782          * affect performance on pre-Haswell Intel EPT, where there is
783          * no EPT Access bit to clear, so we have to tear down EPT
784          * tables instead. If we find this unacceptable, we can always
785          * add a parameter to kvm_age_hva so that it effectively doesn't
786          * do anything on clear_young.
787          *
788          * Also note that currently we never issue secondary TLB flushes
789          * from clear_young, leaving this job up to the regular system
790          * cadence. If we find this inaccurate, we might come up with a
791          * more sophisticated heuristic later.
792          */
793         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
794 }
795
796 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
797                                        struct mm_struct *mm,
798                                        unsigned long address)
799 {
800         trace_kvm_test_age_hva(address);
801
802         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
803                                              kvm_test_age_gfn);
804 }
805
806 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
807                                      struct mm_struct *mm)
808 {
809         struct kvm *kvm = mmu_notifier_to_kvm(mn);
810         int idx;
811
812         idx = srcu_read_lock(&kvm->srcu);
813         kvm_arch_flush_shadow_all(kvm);
814         srcu_read_unlock(&kvm->srcu, idx);
815 }
816
817 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
818         .invalidate_range       = kvm_mmu_notifier_invalidate_range,
819         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
820         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
821         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
822         .clear_young            = kvm_mmu_notifier_clear_young,
823         .test_young             = kvm_mmu_notifier_test_young,
824         .change_pte             = kvm_mmu_notifier_change_pte,
825         .release                = kvm_mmu_notifier_release,
826 };
827
828 static int kvm_init_mmu_notifier(struct kvm *kvm)
829 {
830         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
831         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
832 }
833
834 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
835
836 static int kvm_init_mmu_notifier(struct kvm *kvm)
837 {
838         return 0;
839 }
840
841 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
842
843 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
844 static int kvm_pm_notifier_call(struct notifier_block *bl,
845                                 unsigned long state,
846                                 void *unused)
847 {
848         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
849
850         return kvm_arch_pm_notifier(kvm, state);
851 }
852
853 static void kvm_init_pm_notifier(struct kvm *kvm)
854 {
855         kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
856         /* Suspend KVM before we suspend ftrace, RCU, etc. */
857         kvm->pm_notifier.priority = INT_MAX;
858         register_pm_notifier(&kvm->pm_notifier);
859 }
860
861 static void kvm_destroy_pm_notifier(struct kvm *kvm)
862 {
863         unregister_pm_notifier(&kvm->pm_notifier);
864 }
865 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
866 static void kvm_init_pm_notifier(struct kvm *kvm)
867 {
868 }
869
870 static void kvm_destroy_pm_notifier(struct kvm *kvm)
871 {
872 }
873 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
874
875 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
876 {
877         if (!memslot->dirty_bitmap)
878                 return;
879
880         kvfree(memslot->dirty_bitmap);
881         memslot->dirty_bitmap = NULL;
882 }
883
884 /* This does not remove the slot from struct kvm_memslots data structures */
885 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
886 {
887         kvm_destroy_dirty_bitmap(slot);
888
889         kvm_arch_free_memslot(kvm, slot);
890
891         kfree(slot);
892 }
893
894 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
895 {
896         struct hlist_node *idnode;
897         struct kvm_memory_slot *memslot;
898         int bkt;
899
900         /*
901          * The same memslot objects live in both the active and inactive sets;
902          * arbitrarily free them using index '1' so that the second invocation of
903          * this function isn't operating on a structure with dangling pointers
904          * (even though this function isn't actually touching them).
905          */
906         if (!slots->node_idx)
907                 return;
908
909         hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
910                 kvm_free_memslot(kvm, memslot);
911 }
912
913 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
914 {
915         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
916         case KVM_STATS_TYPE_INSTANT:
917                 return 0444;
918         case KVM_STATS_TYPE_CUMULATIVE:
919         case KVM_STATS_TYPE_PEAK:
920         default:
921                 return 0644;
922         }
923 }
924
925
926 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
927 {
928         int i;
929         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
930                                       kvm_vcpu_stats_header.num_desc;
931
932         if (!kvm->debugfs_dentry)
933                 return;
934
935         debugfs_remove_recursive(kvm->debugfs_dentry);
936
937         if (kvm->debugfs_stat_data) {
938                 for (i = 0; i < kvm_debugfs_num_entries; i++)
939                         kfree(kvm->debugfs_stat_data[i]);
940                 kfree(kvm->debugfs_stat_data);
941         }
942 }
943
944 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
945 {
946         static DEFINE_MUTEX(kvm_debugfs_lock);
947         struct dentry *dent;
948         char dir_name[ITOA_MAX_LEN * 2];
949         struct kvm_stat_data *stat_data;
950         const struct _kvm_stats_desc *pdesc;
951         int i, ret;
952         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
953                                       kvm_vcpu_stats_header.num_desc;
954
955         if (!debugfs_initialized())
956                 return 0;
957
958         snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
959         mutex_lock(&kvm_debugfs_lock);
960         dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
961         if (dent) {
962                 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
963                 dput(dent);
964                 mutex_unlock(&kvm_debugfs_lock);
965                 return 0;
966         }
967         dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
968         mutex_unlock(&kvm_debugfs_lock);
969         if (IS_ERR(dent))
970                 return 0;
971
972         kvm->debugfs_dentry = dent;
973         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
974                                          sizeof(*kvm->debugfs_stat_data),
975                                          GFP_KERNEL_ACCOUNT);
976         if (!kvm->debugfs_stat_data)
977                 return -ENOMEM;
978
979         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
980                 pdesc = &kvm_vm_stats_desc[i];
981                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
982                 if (!stat_data)
983                         return -ENOMEM;
984
985                 stat_data->kvm = kvm;
986                 stat_data->desc = pdesc;
987                 stat_data->kind = KVM_STAT_VM;
988                 kvm->debugfs_stat_data[i] = stat_data;
989                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
990                                     kvm->debugfs_dentry, stat_data,
991                                     &stat_fops_per_vm);
992         }
993
994         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
995                 pdesc = &kvm_vcpu_stats_desc[i];
996                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
997                 if (!stat_data)
998                         return -ENOMEM;
999
1000                 stat_data->kvm = kvm;
1001                 stat_data->desc = pdesc;
1002                 stat_data->kind = KVM_STAT_VCPU;
1003                 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1004                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1005                                     kvm->debugfs_dentry, stat_data,
1006                                     &stat_fops_per_vm);
1007         }
1008
1009         ret = kvm_arch_create_vm_debugfs(kvm);
1010         if (ret) {
1011                 kvm_destroy_vm_debugfs(kvm);
1012                 return ret;
1013         }
1014
1015         return 0;
1016 }
1017
1018 /*
1019  * Called after the VM is otherwise initialized, but just before adding it to
1020  * the vm_list.
1021  */
1022 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1023 {
1024         return 0;
1025 }
1026
1027 /*
1028  * Called just after removing the VM from the vm_list, but before doing any
1029  * other destruction.
1030  */
1031 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1032 {
1033 }
1034
1035 /*
1036  * Called after the per-VM debugfs is created.  kvm->debugfs_dentry should
1037  * already be set up at this point, so arch-specific debugfs entries can be
1038  * created under it.  Cleanup is done automatically and recursively by
1039  * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
1040  */
1041 int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1042 {
1043         return 0;
1044 }
1045
1046 static struct kvm *kvm_create_vm(unsigned long type)
1047 {
1048         struct kvm *kvm = kvm_arch_alloc_vm();
1049         struct kvm_memslots *slots;
1050         int r = -ENOMEM;
1051         int i, j;
1052
1053         if (!kvm)
1054                 return ERR_PTR(-ENOMEM);
1055
1056         KVM_MMU_LOCK_INIT(kvm);
1057         mmgrab(current->mm);
1058         kvm->mm = current->mm;
1059         kvm_eventfd_init(kvm);
1060         mutex_init(&kvm->lock);
1061         mutex_init(&kvm->irq_lock);
1062         mutex_init(&kvm->slots_lock);
1063         mutex_init(&kvm->slots_arch_lock);
1064         spin_lock_init(&kvm->mn_invalidate_lock);
1065         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1066         xa_init(&kvm->vcpu_array);
1067
1068         INIT_LIST_HEAD(&kvm->gpc_list);
1069         spin_lock_init(&kvm->gpc_lock);
1070
1071         INIT_LIST_HEAD(&kvm->devices);
1072
1073         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1074
1075         if (init_srcu_struct(&kvm->srcu))
1076                 goto out_err_no_srcu;
1077         if (init_srcu_struct(&kvm->irq_srcu))
1078                 goto out_err_no_irq_srcu;
1079
1080         refcount_set(&kvm->users_count, 1);
1081         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1082                 for (j = 0; j < 2; j++) {
1083                         slots = &kvm->__memslots[i][j];
1084
1085                         atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1086                         slots->hva_tree = RB_ROOT_CACHED;
1087                         slots->gfn_tree = RB_ROOT;
1088                         hash_init(slots->id_hash);
1089                         slots->node_idx = j;
1090
1091                         /* Generations must be different for each address space. */
1092                         slots->generation = i;
1093                 }
1094
1095                 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1096         }
1097
1098         for (i = 0; i < KVM_NR_BUSES; i++) {
1099                 rcu_assign_pointer(kvm->buses[i],
1100                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1101                 if (!kvm->buses[i])
1102                         goto out_err_no_arch_destroy_vm;
1103         }
1104
1105         kvm->max_halt_poll_ns = halt_poll_ns;
1106
1107         r = kvm_arch_init_vm(kvm, type);
1108         if (r)
1109                 goto out_err_no_arch_destroy_vm;
1110
1111         r = hardware_enable_all();
1112         if (r)
1113                 goto out_err_no_disable;
1114
1115 #ifdef CONFIG_HAVE_KVM_IRQFD
1116         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1117 #endif
1118
1119         r = kvm_init_mmu_notifier(kvm);
1120         if (r)
1121                 goto out_err_no_mmu_notifier;
1122
1123         r = kvm_arch_post_init_vm(kvm);
1124         if (r)
1125                 goto out_err;
1126
1127         mutex_lock(&kvm_lock);
1128         list_add(&kvm->vm_list, &vm_list);
1129         mutex_unlock(&kvm_lock);
1130
1131         preempt_notifier_inc();
1132         kvm_init_pm_notifier(kvm);
1133
1134         return kvm;
1135
1136 out_err:
1137 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1138         if (kvm->mmu_notifier.ops)
1139                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1140 #endif
1141 out_err_no_mmu_notifier:
1142         hardware_disable_all();
1143 out_err_no_disable:
1144         kvm_arch_destroy_vm(kvm);
1145 out_err_no_arch_destroy_vm:
1146         WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1147         for (i = 0; i < KVM_NR_BUSES; i++)
1148                 kfree(kvm_get_bus(kvm, i));
1149         cleanup_srcu_struct(&kvm->irq_srcu);
1150 out_err_no_irq_srcu:
1151         cleanup_srcu_struct(&kvm->srcu);
1152 out_err_no_srcu:
1153         kvm_arch_free_vm(kvm);
1154         mmdrop(current->mm);
1155         return ERR_PTR(r);
1156 }
1157
1158 static void kvm_destroy_devices(struct kvm *kvm)
1159 {
1160         struct kvm_device *dev, *tmp;
1161
1162         /*
1163          * We do not need to take the kvm->lock here, because nobody else
1164          * has a reference to the struct kvm at this point and therefore
1165          * cannot access the devices list anyhow.
1166          */
1167         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1168                 list_del(&dev->vm_node);
1169                 dev->ops->destroy(dev);
1170         }
1171 }
1172
1173 static void kvm_destroy_vm(struct kvm *kvm)
1174 {
1175         int i;
1176         struct mm_struct *mm = kvm->mm;
1177
1178         kvm_destroy_pm_notifier(kvm);
1179         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1180         kvm_destroy_vm_debugfs(kvm);
1181         kvm_arch_sync_events(kvm);
1182         mutex_lock(&kvm_lock);
1183         list_del(&kvm->vm_list);
1184         mutex_unlock(&kvm_lock);
1185         kvm_arch_pre_destroy_vm(kvm);
1186
1187         kvm_free_irq_routing(kvm);
1188         for (i = 0; i < KVM_NR_BUSES; i++) {
1189                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1190
1191                 if (bus)
1192                         kvm_io_bus_destroy(bus);
1193                 kvm->buses[i] = NULL;
1194         }
1195         kvm_coalesced_mmio_free(kvm);
1196 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1197         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1198         /*
1199          * At this point, pending calls to invalidate_range_start()
1200          * have completed but no more MMU notifiers will run, so
1201          * mn_active_invalidate_count may remain unbalanced.
1202          * No threads can be waiting in install_new_memslots as the
1203          * last reference on KVM has been dropped, but freeing
1204          * memslots would deadlock without this manual intervention.
1205          */
1206         WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1207         kvm->mn_active_invalidate_count = 0;
1208 #else
1209         kvm_arch_flush_shadow_all(kvm);
1210 #endif
1211         kvm_arch_destroy_vm(kvm);
1212         kvm_destroy_devices(kvm);
1213         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1214                 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1215                 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1216         }
1217         cleanup_srcu_struct(&kvm->irq_srcu);
1218         cleanup_srcu_struct(&kvm->srcu);
1219         kvm_arch_free_vm(kvm);
1220         preempt_notifier_dec();
1221         hardware_disable_all();
1222         mmdrop(mm);
1223 }
1224
1225 void kvm_get_kvm(struct kvm *kvm)
1226 {
1227         refcount_inc(&kvm->users_count);
1228 }
1229 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1230
1231 /*
1232  * Make sure the VM is not under destruction; this is a safe version of
1233  * kvm_get_kvm().  Returns true if kvm was referenced successfully, false otherwise.
1234  */
1235 bool kvm_get_kvm_safe(struct kvm *kvm)
1236 {
1237         return refcount_inc_not_zero(&kvm->users_count);
1238 }
1239 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1240
1241 void kvm_put_kvm(struct kvm *kvm)
1242 {
1243         if (refcount_dec_and_test(&kvm->users_count))
1244                 kvm_destroy_vm(kvm);
1245 }
1246 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1247
1248 /*
1249  * Used to put a reference that was taken on behalf of an object associated
1250  * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1251  * of the new file descriptor fails and the reference cannot be transferred to
1252  * its final owner.  In such cases, the caller is still actively using @kvm and
1253  * will fail miserably if the refcount unexpectedly hits zero.
1254  */
1255 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1256 {
1257         WARN_ON(refcount_dec_and_test(&kvm->users_count));
1258 }
1259 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
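
/*
 * Illustrative sketch (editorial addition): taking a temporary VM reference
 * from a context that does not already hold one (e.g. a stats or debugfs
 * path), using the "safe" getter above so a VM already on its way to
 * kvm_destroy_vm() is simply skipped.
 */
#if 0
	if (!kvm_get_kvm_safe(kvm))
		return -ENOENT;         /* users_count already hit zero */
	/* ... use kvm ... */
	kvm_put_kvm(kvm);               /* may trigger kvm_destroy_vm() */
#endif
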
1260
1261 static int kvm_vm_release(struct inode *inode, struct file *filp)
1262 {
1263         struct kvm *kvm = filp->private_data;
1264
1265         kvm_irqfd_release(kvm);
1266
1267         kvm_put_kvm(kvm);
1268         return 0;
1269 }
1270
1271 /*
1272  * Allocation size is twice as large as the actual dirty bitmap size.
1273  * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
1274  */
1275 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1276 {
1277         unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1278
1279         memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1280         if (!memslot->dirty_bitmap)
1281                 return -ENOMEM;
1282
1283         return 0;
1284 }
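
/*
 * Illustrative sketch (editorial addition): why the bitmap is allocated at
 * twice the size.  The second half serves as a scratch buffer when the dirty
 * log is fetched (see kvm_vm_ioctl_get_dirty_log()), and conceptually sits
 * right after the first half; the pointer arithmetic below only shows the
 * layout and is not a helper defined in this file.
 */
#if 0
	unsigned long *second_half = memslot->dirty_bitmap +
			kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
#endif
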
1285
1286 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1287 {
1288         struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1289         int node_idx_inactive = active->node_idx ^ 1;
1290
1291         return &kvm->__memslots[as_id][node_idx_inactive];
1292 }
1293
1294 /*
1295  * Helper to get the address space ID when one of memslot pointers may be NULL.
1296  * This also serves as a sanity check that at least one of the pointers is non-NULL,
1297  * and that their address space IDs don't diverge.
1298  */
1299 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1300                                   struct kvm_memory_slot *b)
1301 {
1302         if (WARN_ON_ONCE(!a && !b))
1303                 return 0;
1304
1305         if (!a)
1306                 return b->as_id;
1307         if (!b)
1308                 return a->as_id;
1309
1310         WARN_ON_ONCE(a->as_id != b->as_id);
1311         return a->as_id;
1312 }
1313
1314 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1315                                 struct kvm_memory_slot *slot)
1316 {
1317         struct rb_root *gfn_tree = &slots->gfn_tree;
1318         struct rb_node **node, *parent;
1319         int idx = slots->node_idx;
1320
1321         parent = NULL;
1322         for (node = &gfn_tree->rb_node; *node; ) {
1323                 struct kvm_memory_slot *tmp;
1324
1325                 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1326                 parent = *node;
1327                 if (slot->base_gfn < tmp->base_gfn)
1328                         node = &(*node)->rb_left;
1329                 else if (slot->base_gfn > tmp->base_gfn)
1330                         node = &(*node)->rb_right;
1331                 else
1332                         BUG();
1333         }
1334
1335         rb_link_node(&slot->gfn_node[idx], parent, node);
1336         rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1337 }
1338
1339 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1340                                struct kvm_memory_slot *slot)
1341 {
1342         rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1343 }
1344
1345 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1346                                  struct kvm_memory_slot *old,
1347                                  struct kvm_memory_slot *new)
1348 {
1349         int idx = slots->node_idx;
1350
1351         WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1352
1353         rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1354                         &slots->gfn_tree);
1355 }
1356
1357 /*
1358  * Replace @old with @new in the inactive memslots.
1359  *
1360  * With NULL @old this simply adds @new.
1361  * With NULL @new this simply removes @old.
1362  *
1363  * If @new is non-NULL its hva_node[slots_idx] range has to be set
1364  * appropriately.
1365  */
1366 static void kvm_replace_memslot(struct kvm *kvm,
1367                                 struct kvm_memory_slot *old,
1368                                 struct kvm_memory_slot *new)
1369 {
1370         int as_id = kvm_memslots_get_as_id(old, new);
1371         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1372         int idx = slots->node_idx;
1373
1374         if (old) {
1375                 hash_del(&old->id_node[idx]);
1376                 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1377
1378                 if ((long)old == atomic_long_read(&slots->last_used_slot))
1379                         atomic_long_set(&slots->last_used_slot, (long)new);
1380
1381                 if (!new) {
1382                         kvm_erase_gfn_node(slots, old);
1383                         return;
1384                 }
1385         }
1386
1387         /*
1388          * Initialize @new's hva range.  Do this even when replacing an @old
1389          * slot; kvm_copy_memslot() deliberately does not touch node data.
1390          */
1391         new->hva_node[idx].start = new->userspace_addr;
1392         new->hva_node[idx].last = new->userspace_addr +
1393                                   (new->npages << PAGE_SHIFT) - 1;
1394
1395         /*
1396          * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(), so
1397          * hva_node needs to be swapped with remove+insert even though the hva can't
1398          * change when replacing an existing slot.
1399          */
1400         hash_add(slots->id_hash, &new->id_node[idx], new->id);
1401         interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1402
1403         /*
1404          * If the memslot gfn is unchanged, rb_replace_node() can be used to
1405          * switch the node in the gfn tree instead of removing the old and
1406          * inserting the new as two separate operations. Replacement is a
1407          * single O(1) operation versus two O(log(n)) operations for
1408          * remove+insert.
1409          */
1410         if (old && old->base_gfn == new->base_gfn) {
1411                 kvm_replace_gfn_node(slots, old, new);
1412         } else {
1413                 if (old)
1414                         kvm_erase_gfn_node(slots, old);
1415                 kvm_insert_gfn_node(slots, new);
1416         }
1417 }
1418
1419 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1420 {
1421         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1422
1423 #ifdef __KVM_HAVE_READONLY_MEM
1424         valid_flags |= KVM_MEM_READONLY;
1425 #endif
1426
1427         if (mem->flags & ~valid_flags)
1428                 return -EINVAL;
1429
1430         return 0;
1431 }
1432
1433 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1434 {
1435         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1436
1437         /* Grab the generation from the currently active memslots. */
1438         u64 gen = __kvm_memslots(kvm, as_id)->generation;
1439
1440         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1441         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1442
1443         /*
1444          * Do not store the new memslots while there are invalidations in
1445          * progress, otherwise the locking in invalidate_range_start and
1446          * invalidate_range_end will be unbalanced.
1447          */
1448         spin_lock(&kvm->mn_invalidate_lock);
1449         prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1450         while (kvm->mn_active_invalidate_count) {
1451                 set_current_state(TASK_UNINTERRUPTIBLE);
1452                 spin_unlock(&kvm->mn_invalidate_lock);
1453                 schedule();
1454                 spin_lock(&kvm->mn_invalidate_lock);
1455         }
1456         finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1457         rcu_assign_pointer(kvm->memslots[as_id], slots);
1458         spin_unlock(&kvm->mn_invalidate_lock);
1459
1460         /*
1461          * Acquired in kvm_set_memslot. Must be released before synchronizing
1462          * SRCU below in order to avoid deadlock with another thread
1463          * acquiring the slots_arch_lock in an srcu critical section.
1464          */
1465         mutex_unlock(&kvm->slots_arch_lock);
1466
1467         synchronize_srcu_expedited(&kvm->srcu);
1468
1469         /*
1470          * Increment the new memslot generation a second time, dropping the
1471          * update in-progress flag and incrementing the generation based on
1472          * the number of address spaces.  This provides a unique and easily
1473          * identifiable generation number while the memslots are in flux.
1474          */
1475         gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1476
1477         /*
1478          * Generations must be unique even across address spaces.  We do not need
1479          * a global counter for that; instead, the generation space is evenly split
1480          * across address spaces.  For example, with two address spaces, address
1481          * space 0 will use generations 0, 2, 4, ... while address space 1 will
1482          * use generations 1, 3, 5, ...
1483          */
1484         gen += KVM_ADDRESS_SPACE_NUM;
1485
1486         kvm_arch_memslots_updated(kvm, gen);
1487
1488         slots->generation = gen;
1489 }
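
/*
 * Worked example (editorial addition) of the generation scheme above, for
 * KVM_ADDRESS_SPACE_NUM == 2: address space 0 starts at generation 0 and
 * address space 1 at generation 1 (see kvm_create_vm()).  Each swap of the
 * active/inactive sets first publishes gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS
 * and then settles on gen + 2, so address space 0 sees 0, 2, 4, ... and
 * address space 1 sees 1, 3, 5, ..., with the in-progress flag visible only
 * while an update is in flight.
 */
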
1490
1491 static int kvm_prepare_memory_region(struct kvm *kvm,
1492                                      const struct kvm_memory_slot *old,
1493                                      struct kvm_memory_slot *new,
1494                                      enum kvm_mr_change change)
1495 {
1496         int r;
1497
1498         /*
1499          * If dirty logging is disabled, nullify the bitmap; the old bitmap
1500          * will be freed on "commit".  If logging is enabled in both old and
1501          * new, reuse the existing bitmap.  If logging is enabled only in the
1502          * new and KVM isn't using a ring buffer, allocate and initialize a
1503          * new bitmap.
1504          */
1505         if (change != KVM_MR_DELETE) {
1506                 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1507                         new->dirty_bitmap = NULL;
1508                 else if (old && old->dirty_bitmap)
1509                         new->dirty_bitmap = old->dirty_bitmap;
1510                 else if (!kvm->dirty_ring_size) {
1511                         r = kvm_alloc_dirty_bitmap(new);
1512                         if (r)
1513                                 return r;
1514
1515                         if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1516                                 bitmap_set(new->dirty_bitmap, 0, new->npages);
1517                 }
1518         }
1519
1520         r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1521
1522         /* Free the bitmap on failure if it was allocated above. */
1523         if (r && new && new->dirty_bitmap && old && !old->dirty_bitmap)
1524                 kvm_destroy_dirty_bitmap(new);
1525
1526         return r;
1527 }
1528
1529 static void kvm_commit_memory_region(struct kvm *kvm,
1530                                      struct kvm_memory_slot *old,
1531                                      const struct kvm_memory_slot *new,
1532                                      enum kvm_mr_change change)
1533 {
1534         /*
1535          * Update the total number of memslot pages before calling the arch
1536          * hook so that architectures can consume the result directly.
1537          */
1538         if (change == KVM_MR_DELETE)
1539                 kvm->nr_memslot_pages -= old->npages;
1540         else if (change == KVM_MR_CREATE)
1541                 kvm->nr_memslot_pages += new->npages;
1542
1543         kvm_arch_commit_memory_region(kvm, old, new, change);
1544
1545         switch (change) {
1546         case KVM_MR_CREATE:
1547                 /* Nothing more to do. */
1548                 break;
1549         case KVM_MR_DELETE:
1550                 /* Free the old memslot and all its metadata. */
1551                 kvm_free_memslot(kvm, old);
1552                 break;
1553         case KVM_MR_MOVE:
1554         case KVM_MR_FLAGS_ONLY:
1555                 /*
1556                  * Free the dirty bitmap as needed; the below check encompasses
1557                  * both the flags and whether a ring buffer is being used.
1558                  */
1559                 if (old->dirty_bitmap && !new->dirty_bitmap)
1560                         kvm_destroy_dirty_bitmap(old);
1561
1562                 /*
1563                  * The final quirk.  Free the detached, old slot, but only its
1564                  * memory, not any metadata.  Metadata, including arch specific
1565                  * data, may be reused by @new.
1566                  */
1567                 kfree(old);
1568                 break;
1569         default:
1570                 BUG();
1571         }
1572 }
1573
1574 /*
1575  * Activate @new, which must be installed in the inactive slots by the caller,
1576  * by swapping the active slots and then propagating @new to @old once @old is
1577  * unreachable and can be safely modified.
1578  *
1579  * With NULL @old this simply adds @new to @active (while swapping the sets).
1580  * With NULL @new this simply removes @old from @active and frees it
1581  * (while also swapping the sets).
1582  */
1583 static void kvm_activate_memslot(struct kvm *kvm,
1584                                  struct kvm_memory_slot *old,
1585                                  struct kvm_memory_slot *new)
1586 {
1587         int as_id = kvm_memslots_get_as_id(old, new);
1588
1589         kvm_swap_active_memslots(kvm, as_id);
1590
1591         /* Propagate the new memslot to the now inactive memslots. */
1592         kvm_replace_memslot(kvm, old, new);
1593 }
1594
1595 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1596                              const struct kvm_memory_slot *src)
1597 {
1598         dest->base_gfn = src->base_gfn;
1599         dest->npages = src->npages;
1600         dest->dirty_bitmap = src->dirty_bitmap;
1601         dest->arch = src->arch;
1602         dest->userspace_addr = src->userspace_addr;
1603         dest->flags = src->flags;
1604         dest->id = src->id;
1605         dest->as_id = src->as_id;
1606 }
1607
1608 static void kvm_invalidate_memslot(struct kvm *kvm,
1609                                    struct kvm_memory_slot *old,
1610                                    struct kvm_memory_slot *invalid_slot)
1611 {
1612         /*
1613          * Mark the current slot INVALID.  As with all memslot modifications,
1614          * this must be done on an unreachable slot to avoid modifying the
1615          * current slot in the active tree.
1616          */
1617         kvm_copy_memslot(invalid_slot, old);
1618         invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1619         kvm_replace_memslot(kvm, old, invalid_slot);
1620
1621         /*
1622          * Activate the slot that is now marked INVALID, but don't propagate
1623          * the slot to the now inactive slots. The slot is either going to be
1624          * deleted or recreated as a new slot.
1625          */
1626         kvm_swap_active_memslots(kvm, old->as_id);
1627
1628         /*
1629          * From this point no new shadow pages pointing to a deleted, or moved,
1630          * memslot will be created.  Validation of sp->gfn happens in:
1631          *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1632          *      - kvm_is_visible_gfn (mmu_check_root)
1633          */
1634         kvm_arch_flush_shadow_memslot(kvm, old);
1635
1636         /* Was released by kvm_swap_active_memslots, reacquire. */
1637         mutex_lock(&kvm->slots_arch_lock);
1638
1639         /*
1640          * Copy the arch-specific field of the newly-installed slot back to the
1641          * old slot as the arch data could have changed between releasing
1642          * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1643          * above.  Writers are required to retrieve memslots *after* acquiring
1644          * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1645          */
1646         old->arch = invalid_slot->arch;
1647 }
1648
1649 static void kvm_create_memslot(struct kvm *kvm,
1650                                struct kvm_memory_slot *new)
1651 {
1652         /* Add the new memslot to the inactive set and activate. */
1653         kvm_replace_memslot(kvm, NULL, new);
1654         kvm_activate_memslot(kvm, NULL, new);
1655 }
1656
1657 static void kvm_delete_memslot(struct kvm *kvm,
1658                                struct kvm_memory_slot *old,
1659                                struct kvm_memory_slot *invalid_slot)
1660 {
1661         /*
1662          * Remove the old memslot (in the inactive memslots) by passing NULL as
1663          * the "new" slot, and likewise for the invalid version in the active slots.
1664          */
1665         kvm_replace_memslot(kvm, old, NULL);
1666         kvm_activate_memslot(kvm, invalid_slot, NULL);
1667 }
1668
1669 static void kvm_move_memslot(struct kvm *kvm,
1670                              struct kvm_memory_slot *old,
1671                              struct kvm_memory_slot *new,
1672                              struct kvm_memory_slot *invalid_slot)
1673 {
1674         /*
1675          * Replace the old memslot in the inactive slots, and then swap slots
1676          * and replace the current INVALID with the new as well.
1677          */
1678         kvm_replace_memslot(kvm, old, new);
1679         kvm_activate_memslot(kvm, invalid_slot, new);
1680 }
1681
1682 static void kvm_update_flags_memslot(struct kvm *kvm,
1683                                      struct kvm_memory_slot *old,
1684                                      struct kvm_memory_slot *new)
1685 {
1686         /*
1687          * Similar to the MOVE case, but the slot doesn't need to be zapped as
1688          * an intermediate step. Instead, the old memslot is simply replaced
1689          * with a new, updated copy in both memslot sets.
1690          */
1691         kvm_replace_memslot(kvm, old, new);
1692         kvm_activate_memslot(kvm, old, new);
1693 }
1694
1695 static int kvm_set_memslot(struct kvm *kvm,
1696                            struct kvm_memory_slot *old,
1697                            struct kvm_memory_slot *new,
1698                            enum kvm_mr_change change)
1699 {
1700         struct kvm_memory_slot *invalid_slot;
1701         int r;
1702
1703         /*
1704          * Released in kvm_swap_active_memslots.
1705          *
1706          * Must be held from before the current memslots are copied until
1707          * after the new memslots are installed with rcu_assign_pointer,
1708          * then released before the synchronize srcu in kvm_swap_active_memslots.
1709          *
1710          * When modifying memslots outside of the slots_lock, must be held
1711          * before reading the pointer to the current memslots until after all
1712          * changes to those memslots are complete.
1713          *
1714          * These rules ensure that installing new memslots does not lose
1715          * changes made to the previous memslots.
1716          */
1717         mutex_lock(&kvm->slots_arch_lock);
1718
1719         /*
1720          * Invalidate the old slot if it's being deleted or moved.  This is
1721          * done prior to actually deleting/moving the memslot to allow vCPUs to
1722          * continue running by ensuring there are no mappings or shadow pages
1723          * for the memslot when it is deleted/moved.  Without pre-invalidation
1724          * (and without a lock), a window would exist between effecting the
1725          * delete/move and committing the changes in arch code where KVM or a
1726          * guest could access a non-existent memslot.
1727          *
1728          * Modifications are done on a temporary, unreachable slot.  The old
1729          * slot needs to be preserved in case a later step fails and the
1730          * invalidation needs to be reverted.
1731          */
1732         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1733                 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1734                 if (!invalid_slot) {
1735                         mutex_unlock(&kvm->slots_arch_lock);
1736                         return -ENOMEM;
1737                 }
1738                 kvm_invalidate_memslot(kvm, old, invalid_slot);
1739         }
1740
1741         r = kvm_prepare_memory_region(kvm, old, new, change);
1742         if (r) {
1743                 /*
1744                  * For DELETE/MOVE, revert the above INVALID change.  No
1745                  * modifications required since the original slot was preserved
1746                  * in the inactive slots.  Changing the active memslots also
1747                  * releases slots_arch_lock.
1748                  */
1749                 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1750                         kvm_activate_memslot(kvm, invalid_slot, old);
1751                         kfree(invalid_slot);
1752                 } else {
1753                         mutex_unlock(&kvm->slots_arch_lock);
1754                 }
1755                 return r;
1756         }
1757
1758         /*
1759          * For DELETE and MOVE, the invalid slot is now active as the INVALID
1760          * version of the old slot.  MOVE is particularly special as it reuses
1761          * the old slot and keeps a copy of the old slot (in invalid_slot).
1762          * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
1763          * old slot is detached but otherwise preserved.
1764          */
1765         if (change == KVM_MR_CREATE)
1766                 kvm_create_memslot(kvm, new);
1767         else if (change == KVM_MR_DELETE)
1768                 kvm_delete_memslot(kvm, old, invalid_slot);
1769         else if (change == KVM_MR_MOVE)
1770                 kvm_move_memslot(kvm, old, new, invalid_slot);
1771         else if (change == KVM_MR_FLAGS_ONLY)
1772                 kvm_update_flags_memslot(kvm, old, new);
1773         else
1774                 BUG();
1775
1776         /* Free the temporary INVALID slot used for DELETE and MOVE. */
1777         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1778                 kfree(invalid_slot);
1779
1780         /*
1781          * No need to refresh new->arch, changes after dropping slots_arch_lock
1782          * will directly hit the final, active memslot.  Architectures are
1783          * responsible for knowing that new->arch may be stale.
1784          */
1785         kvm_commit_memory_region(kvm, old, new, change);
1786
1787         return 0;
1788 }
1789
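/*
 * Returns true if any existing memslot other than @id overlaps the given
 * gfn range.
 */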
1790 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1791                                       gfn_t start, gfn_t end)
1792 {
1793         struct kvm_memslot_iter iter;
1794
1795         kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1796                 if (iter.slot->id != id)
1797                         return true;
1798         }
1799
1800         return false;
1801 }
1802
1803 /*
1804  * Allocate some memory and give it an address in the guest physical address
1805  * space.
1806  *
1807  * Discontiguous memory is allowed, mostly for framebuffers.
1808  *
1809  * Must be called holding kvm->slots_lock for write.
1810  */
1811 int __kvm_set_memory_region(struct kvm *kvm,
1812                             const struct kvm_userspace_memory_region *mem)
1813 {
1814         struct kvm_memory_slot *old, *new;
1815         struct kvm_memslots *slots;
1816         enum kvm_mr_change change;
1817         unsigned long npages;
1818         gfn_t base_gfn;
1819         int as_id, id;
1820         int r;
1821
1822         r = check_memory_region_flags(mem);
1823         if (r)
1824                 return r;
1825
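        /*
         * The slot field encodes both the address space and the slot id:
         * bits 31:16 hold the address space, bits 15:0 the slot id.  For
         * example, slot 0x00010002 targets memslot 2 in address space 1.
         */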
1826         as_id = mem->slot >> 16;
1827         id = (u16)mem->slot;
1828
1829         /* General sanity checks */
1830         if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1831             (mem->memory_size != (unsigned long)mem->memory_size))
1832                 return -EINVAL;
1833         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1834                 return -EINVAL;
1835         /* We can read the guest memory with __xxx_user() later on. */
1836         if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1837             (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1838              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1839                         mem->memory_size))
1840                 return -EINVAL;
1841         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1842                 return -EINVAL;
1843         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1844                 return -EINVAL;
1845         if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1846                 return -EINVAL;
1847
1848         slots = __kvm_memslots(kvm, as_id);
1849
1850         /*
1851          * Note, the old memslot (and the pointer itself!) may be invalidated
1852          * and/or destroyed by kvm_set_memslot().
1853          */
1854         old = id_to_memslot(slots, id);
1855
1856         if (!mem->memory_size) {
1857                 if (!old || !old->npages)
1858                         return -EINVAL;
1859
1860                 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1861                         return -EIO;
1862
1863                 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
1864         }
1865
1866         base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
1867         npages = (mem->memory_size >> PAGE_SHIFT);
1868
1869         if (!old || !old->npages) {
1870                 change = KVM_MR_CREATE;
1871
1872                 /*
1873                  * To simplify KVM internals, the total number of pages across
1874                  * all memslots must fit in an unsigned long.
1875                  */
1876                 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
1877                         return -EINVAL;
1878         } else { /* Modify an existing slot. */
1879                 if ((mem->userspace_addr != old->userspace_addr) ||
1880                     (npages != old->npages) ||
1881                     ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
1882                         return -EINVAL;
1883
1884                 if (base_gfn != old->base_gfn)
1885                         change = KVM_MR_MOVE;
1886                 else if (mem->flags != old->flags)
1887                         change = KVM_MR_FLAGS_ONLY;
1888                 else /* Nothing to change. */
1889                         return 0;
1890         }
1891
1892         if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
1893             kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
1894                 return -EEXIST;
1895
1896         /* Allocate a slot that will persist in the memslots. */
1897         new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
1898         if (!new)
1899                 return -ENOMEM;
1900
1901         new->as_id = as_id;
1902         new->id = id;
1903         new->base_gfn = base_gfn;
1904         new->npages = npages;
1905         new->flags = mem->flags;
1906         new->userspace_addr = mem->userspace_addr;
1907
1908         r = kvm_set_memslot(kvm, old, new, change);
1909         if (r)
1910                 kfree(new);
1911         return r;
1912 }
1913 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1914
1915 int kvm_set_memory_region(struct kvm *kvm,
1916                           const struct kvm_userspace_memory_region *mem)
1917 {
1918         int r;
1919
1920         mutex_lock(&kvm->slots_lock);
1921         r = __kvm_set_memory_region(kvm, mem);
1922         mutex_unlock(&kvm->slots_lock);
1923         return r;
1924 }
1925 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1926
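/*
 * Illustrative userspace usage of the ioctl handled below (a sketch only;
 * names such as vm_fd and backing_mem are hypothetical):
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot = 0,
 *		.flags = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,
 *		.memory_size = 0x200000,
 *		.userspace_addr = (__u64)backing_mem,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 * Passing memory_size == 0 for an existing slot deletes it (see
 * __kvm_set_memory_region() above).
 */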
1927 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1928                                           struct kvm_userspace_memory_region *mem)
1929 {
1930         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1931                 return -EINVAL;
1932
1933         return kvm_set_memory_region(kvm, mem);
1934 }
1935
1936 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1937 /**
1938  * kvm_get_dirty_log - get a snapshot of dirty pages
1939  * @kvm:        pointer to kvm instance
1940  * @log:        slot id and address to which we copy the log
1941  * @is_dirty:   set to '1' if any dirty pages were found
1942  * @memslot:    set to the associated memslot, always valid on success
1943  */
1944 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1945                       int *is_dirty, struct kvm_memory_slot **memslot)
1946 {
1947         struct kvm_memslots *slots;
1948         int i, as_id, id;
1949         unsigned long n;
1950         unsigned long any = 0;
1951
1952         /* Dirty ring tracking is exclusive to dirty log tracking */
1953         if (kvm->dirty_ring_size)
1954                 return -ENXIO;
1955
1956         *memslot = NULL;
1957         *is_dirty = 0;
1958
1959         as_id = log->slot >> 16;
1960         id = (u16)log->slot;
1961         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1962                 return -EINVAL;
1963
1964         slots = __kvm_memslots(kvm, as_id);
1965         *memslot = id_to_memslot(slots, id);
1966         if (!(*memslot) || !(*memslot)->dirty_bitmap)
1967                 return -ENOENT;
1968
1969         kvm_arch_sync_dirty_log(kvm, *memslot);
1970
1971         n = kvm_dirty_bitmap_bytes(*memslot);
1972
1973         for (i = 0; !any && i < n/sizeof(long); ++i)
1974                 any = (*memslot)->dirty_bitmap[i];
1975
1976         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1977                 return -EFAULT;
1978
1979         if (any)
1980                 *is_dirty = 1;
1981         return 0;
1982 }
1983 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1984
1985 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
1986 /**
1987  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1988  *      and reenable dirty page tracking for the corresponding pages.
1989  * @kvm:        pointer to kvm instance
1990  * @log:        slot id and address to which we copy the log
1991  *
1992  * Keep in mind that VCPU threads can write to the bitmap concurrently.
1993  * So, to avoid losing track of dirty pages, we keep the
1994  * following order:
1995  *
1996  *    1. Take a snapshot of the bit and clear it if needed.
1997  *    2. Write protect the corresponding page.
1998  *    3. Copy the snapshot to the userspace.
1999  *    4. Upon return, the caller flushes TLBs if needed.
2000  *
2001  * Between 2 and 4, the guest may write to the page using the remaining TLB
2002  * entry.  This is not a problem because the page is reported dirty using
2003  * the snapshot taken before and step 4 ensures that writes done after
2004  * exiting to userspace will be logged for the next call.
2005  *
2006  */
2007 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2008 {
2009         struct kvm_memslots *slots;
2010         struct kvm_memory_slot *memslot;
2011         int i, as_id, id;
2012         unsigned long n;
2013         unsigned long *dirty_bitmap;
2014         unsigned long *dirty_bitmap_buffer;
2015         bool flush;
2016
2017         /* Dirty ring tracking is exclusive to dirty log tracking */
2018         if (kvm->dirty_ring_size)
2019                 return -ENXIO;
2020
2021         as_id = log->slot >> 16;
2022         id = (u16)log->slot;
2023         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2024                 return -EINVAL;
2025
2026         slots = __kvm_memslots(kvm, as_id);
2027         memslot = id_to_memslot(slots, id);
2028         if (!memslot || !memslot->dirty_bitmap)
2029                 return -ENOENT;
2030
2031         dirty_bitmap = memslot->dirty_bitmap;
2032
2033         kvm_arch_sync_dirty_log(kvm, memslot);
2034
2035         n = kvm_dirty_bitmap_bytes(memslot);
2036         flush = false;
2037         if (kvm->manual_dirty_log_protect) {
2038                 /*
2039                  * Unlike kvm_get_dirty_log, we always leave flush set to false
2040                  * here, because no flush is needed until KVM_CLEAR_DIRTY_LOG.
2041                  * There is some code duplication between this function and
2042                  * kvm_get_dirty_log, but hopefully all architectures will
2043                  * transition to kvm_get_dirty_log_protect so that
2044                  * kvm_get_dirty_log can be eliminated.
2045                  */
2046                 dirty_bitmap_buffer = dirty_bitmap;
2047         } else {
2048                 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2049                 memset(dirty_bitmap_buffer, 0, n);
2050
2051                 KVM_MMU_LOCK(kvm);
2052                 for (i = 0; i < n / sizeof(long); i++) {
2053                         unsigned long mask;
2054                         gfn_t offset;
2055
2056                         if (!dirty_bitmap[i])
2057                                 continue;
2058
2059                         flush = true;
2060                         mask = xchg(&dirty_bitmap[i], 0);
2061                         dirty_bitmap_buffer[i] = mask;
2062
2063                         offset = i * BITS_PER_LONG;
2064                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2065                                                                 offset, mask);
2066                 }
2067                 KVM_MMU_UNLOCK(kvm);
2068         }
2069
2070         if (flush)
2071                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2072
2073         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2074                 return -EFAULT;
2075         return 0;
2076 }
2077
2078
2079 /**
2080  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2081  * @kvm: kvm instance
2082  * @log: slot id and address to which we copy the log
2083  *
2084  * Steps 1-4 below provide a general overview of dirty page logging. See
2085  * kvm_get_dirty_log_protect() function description for additional details.
2086  *
2087  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2088  * always flush the TLB (step 4) even if a previous step failed and the dirty
2089  * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
2090  * API does not preclude a subsequent dirty log read by userspace. Flushing the
2091  * TLB ensures writes will be marked dirty for the next log read.
2092  *
2093  *   1. Take a snapshot of the bit and clear it if needed.
2094  *   2. Write protect the corresponding page.
2095  *   3. Copy the snapshot to the userspace.
2096  *   4. Flush TLBs if needed.
2097  */
2098 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2099                                       struct kvm_dirty_log *log)
2100 {
2101         int r;
2102
2103         mutex_lock(&kvm->slots_lock);
2104
2105         r = kvm_get_dirty_log_protect(kvm, log);
2106
2107         mutex_unlock(&kvm->slots_lock);
2108         return r;
2109 }
2110
2111 /**
2112  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2113  *      and reenable dirty page tracking for the corresponding pages.
2114  * @kvm:        pointer to kvm instance
2115  * @log:        slot id and address from which to fetch the bitmap of dirty pages
2116  */
2117 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2118                                        struct kvm_clear_dirty_log *log)
2119 {
2120         struct kvm_memslots *slots;
2121         struct kvm_memory_slot *memslot;
2122         int as_id, id;
2123         gfn_t offset;
2124         unsigned long i, n;
2125         unsigned long *dirty_bitmap;
2126         unsigned long *dirty_bitmap_buffer;
2127         bool flush;
2128
2129         /* Dirty ring tracking is exclusive to dirty log tracking */
2130         if (kvm->dirty_ring_size)
2131                 return -ENXIO;
2132
2133         as_id = log->slot >> 16;
2134         id = (u16)log->slot;
2135         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2136                 return -EINVAL;
2137
2138         if (log->first_page & 63)
2139                 return -EINVAL;
2140
2141         slots = __kvm_memslots(kvm, as_id);
2142         memslot = id_to_memslot(slots, id);
2143         if (!memslot || !memslot->dirty_bitmap)
2144                 return -ENOENT;
2145
2146         dirty_bitmap = memslot->dirty_bitmap;
2147
2148         n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2149
2150         if (log->first_page > memslot->npages ||
2151             log->num_pages > memslot->npages - log->first_page ||
2152             (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2153             return -EINVAL;
2154
2155         kvm_arch_sync_dirty_log(kvm, memslot);
2156
2157         flush = false;
2158         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2159         if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2160                 return -EFAULT;
2161
2162         KVM_MMU_LOCK(kvm);
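        /*
         * Walk the user-provided bitmap one word at a time, starting with the
         * word containing first_page.  Bits that are both set by userspace and
         * still set in the memslot's dirty bitmap are cleared and have dirty
         * logging re-enabled via kvm_arch_mmu_enable_log_dirty_pt_masked().
         */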
2163         for (offset = log->first_page, i = offset / BITS_PER_LONG,
2164                  n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2165              i++, offset += BITS_PER_LONG) {
2166                 unsigned long mask = *dirty_bitmap_buffer++;
2167                 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2168                 if (!mask)
2169                         continue;
2170
2171                 mask &= atomic_long_fetch_andnot(mask, p);
2172
2173                 /*
2174                  * mask contains the bits that really have been cleared.  This
2175                  * never includes any bits beyond the length of the memslot (if
2176                  * the length is not aligned to 64 pages), therefore it is not
2177                  * a problem if userspace sets them in log->dirty_bitmap.
2178                  */
2179                 if (mask) {
2180                         flush = true;
2181                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2182                                                                 offset, mask);
2183                 }
2184         }
2185         KVM_MMU_UNLOCK(kvm);
2186
2187         if (flush)
2188                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2189
2190         return 0;
2191 }
2192
2193 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2194                                         struct kvm_clear_dirty_log *log)
2195 {
2196         int r;
2197
2198         mutex_lock(&kvm->slots_lock);
2199
2200         r = kvm_clear_dirty_log_protect(kvm, log);
2201
2202         mutex_unlock(&kvm->slots_lock);
2203         return r;
2204 }
2205 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2206
2207 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2208 {
2209         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2210 }
2211 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2212
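/*
 * Per-vCPU variant of gfn_to_memslot() that first consults the vCPU's
 * last_used_slot cache.  The cache is keyed on the memslots generation so a
 * stale slot (or one from a different address space) is never returned.
 */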
2213 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2214 {
2215         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2216         u64 gen = slots->generation;
2217         struct kvm_memory_slot *slot;
2218
2219         /*
2220          * This also protects against using a memslot from a different address space,
2221          * since different address spaces have different generation numbers.
2222          */
2223         if (unlikely(gen != vcpu->last_used_slot_gen)) {
2224                 vcpu->last_used_slot = NULL;
2225                 vcpu->last_used_slot_gen = gen;
2226         }
2227
2228         slot = try_get_memslot(vcpu->last_used_slot, gfn);
2229         if (slot)
2230                 return slot;
2231
2232         /*
2233          * Fall back to searching all memslots. We purposely use
2234          * search_memslots() instead of __gfn_to_memslot() to avoid
2235          * thrashing the VM-wide last_used_slot in kvm_memslots.
2236          */
2237         slot = search_memslots(slots, gfn, false);
2238         if (slot) {
2239                 vcpu->last_used_slot = slot;
2240                 return slot;
2241         }
2242
2243         return NULL;
2244 }
2245
2246 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2247 {
2248         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2249
2250         return kvm_is_visible_memslot(memslot);
2251 }
2252 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2253
2254 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2255 {
2256         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2257
2258         return kvm_is_visible_memslot(memslot);
2259 }
2260 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2261
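/*
 * Return the size of the host page backing @gfn, e.g. a hugetlb page size
 * for hugetlb-backed memory.  Falls back to PAGE_SIZE if the gfn has no
 * valid hva or no VMA covers it.
 */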
2262 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2263 {
2264         struct vm_area_struct *vma;
2265         unsigned long addr, size;
2266
2267         size = PAGE_SIZE;
2268
2269         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2270         if (kvm_is_error_hva(addr))
2271                 return PAGE_SIZE;
2272
2273         mmap_read_lock(current->mm);
2274         vma = find_vma(current->mm, addr);
2275         if (!vma)
2276                 goto out;
2277
2278         size = vma_kernel_pagesize(vma);
2279
2280 out:
2281         mmap_read_unlock(current->mm);
2282
2283         return size;
2284 }
2285
2286 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2287 {
2288         return slot->flags & KVM_MEM_READONLY;
2289 }
2290
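/*
 * Translate @gfn in @slot to a host virtual address.  Returns KVM_HVA_ERR_BAD
 * for a missing or invalid slot and KVM_HVA_ERR_RO_BAD for a write to a
 * read-only slot.  If @nr_pages is non-NULL it is set to the number of pages
 * remaining in the slot starting at @gfn.
 */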
2291 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2292                                        gfn_t *nr_pages, bool write)
2293 {
2294         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2295                 return KVM_HVA_ERR_BAD;
2296
2297         if (memslot_is_readonly(slot) && write)
2298                 return KVM_HVA_ERR_RO_BAD;
2299
2300         if (nr_pages)
2301                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2302
2303         return __gfn_to_hva_memslot(slot, gfn);
2304 }
2305
2306 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2307                                      gfn_t *nr_pages)
2308 {
2309         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2310 }
2311
2312 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2313                                         gfn_t gfn)
2314 {
2315         return gfn_to_hva_many(slot, gfn, NULL);
2316 }
2317 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2318
2319 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2320 {
2321         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2322 }
2323 EXPORT_SYMBOL_GPL(gfn_to_hva);
2324
2325 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2326 {
2327         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2328 }
2329 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2330
2331 /*
2332  * Return the hva of a @gfn and the R/W attribute if possible.
2333  *
2334  * @slot: the kvm_memory_slot which contains @gfn
2335  * @gfn: the gfn to be translated
2336  * @writable: used to return the read/write attribute of the @slot if the hva
2337  * is valid and @writable is not NULL
2338  */
2339 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2340                                       gfn_t gfn, bool *writable)
2341 {
2342         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2343
2344         if (!kvm_is_error_hva(hva) && writable)
2345                 *writable = !memslot_is_readonly(slot);
2346
2347         return hva;
2348 }
2349
2350 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2351 {
2352         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2353
2354         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2355 }
2356
2357 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2358 {
2359         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2360
2361         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2362 }
2363
2364 static inline int check_user_page_hwpoison(unsigned long addr)
2365 {
2366         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2367
2368         rc = get_user_pages(addr, 1, flags, NULL, NULL);
2369         return rc == -EHWPOISON;
2370 }
2371
2372 /*
2373  * The fast path to get the writable pfn which will be stored in @pfn,
2374  * true indicates success, otherwise false is returned.  It's also the
2375  * only path that can be taken in atomic context.
2376  */
2377 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2378                             bool *writable, kvm_pfn_t *pfn)
2379 {
2380         struct page *page[1];
2381
2382         /*
2383          * Fast pin a writable pfn only if it is a write fault request
2384          * or the caller allows to map a writable pfn for a read fault
2385          * request.
2386          */
2387         if (!(write_fault || writable))
2388                 return false;
2389
2390         if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2391                 *pfn = page_to_pfn(page[0]);
2392
2393                 if (writable)
2394                         *writable = true;
2395                 return true;
2396         }
2397
2398         return false;
2399 }
2400
2401 /*
2402  * The slow path to get the pfn of the specified host virtual address;
2403  * 1 indicates success, -errno is returned if an error is detected.
2404  */
2405 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2406                            bool *writable, kvm_pfn_t *pfn)
2407 {
2408         unsigned int flags = FOLL_HWPOISON;
2409         struct page *page;
2410         int npages = 0;
2411
2412         might_sleep();
2413
2414         if (writable)
2415                 *writable = write_fault;
2416
2417         if (write_fault)
2418                 flags |= FOLL_WRITE;
2419         if (async)
2420                 flags |= FOLL_NOWAIT;
2421
2422         npages = get_user_pages_unlocked(addr, 1, &page, flags);
2423         if (npages != 1)
2424                 return npages;
2425
2426         /* map read fault as writable if possible */
2427         if (unlikely(!write_fault) && writable) {
2428                 struct page *wpage;
2429
2430                 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2431                         *writable = true;
2432                         put_page(page);
2433                         page = wpage;
2434                 }
2435         }
2436         *pfn = page_to_pfn(page);
2437         return npages;
2438 }
2439
2440 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2441 {
2442         if (unlikely(!(vma->vm_flags & VM_READ)))
2443                 return false;
2444
2445         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2446                 return false;
2447
2448         return true;
2449 }
2450
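/*
 * Try to grab a reference to the page backing @pfn.  Reserved pfns are not
 * refcounted, so they trivially "succeed"; see also kvm_release_pfn_clean().
 */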
2451 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2452 {
2453         if (kvm_is_reserved_pfn(pfn))
2454                 return 1;
2455         return get_page_unless_zero(pfn_to_page(pfn));
2456 }
2457
2458 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2459                                unsigned long addr, bool write_fault,
2460                                bool *writable, kvm_pfn_t *p_pfn)
2461 {
2462         kvm_pfn_t pfn;
2463         pte_t *ptep;
2464         spinlock_t *ptl;
2465         int r;
2466
2467         r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2468         if (r) {
2469                 /*
2470                  * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2471                  * not call the fault handler, so do it here.
2472                  */
2473                 bool unlocked = false;
2474                 r = fixup_user_fault(current->mm, addr,
2475                                      (write_fault ? FAULT_FLAG_WRITE : 0),
2476                                      &unlocked);
2477                 if (unlocked)
2478                         return -EAGAIN;
2479                 if (r)
2480                         return r;
2481
2482                 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2483                 if (r)
2484                         return r;
2485         }
2486
2487         if (write_fault && !pte_write(*ptep)) {
2488                 pfn = KVM_PFN_ERR_RO_FAULT;
2489                 goto out;
2490         }
2491
2492         if (writable)
2493                 *writable = pte_write(*ptep);
2494         pfn = pte_pfn(*ptep);
2495
2496         /*
2497          * Get a reference here because callers of *hva_to_pfn* and
2498          * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2499          * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2500          * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2501          * simply do nothing for reserved pfns.
2502          *
2503          * Whoever called remap_pfn_range is also going to call e.g.
2504          * unmap_mapping_range before the underlying pages are freed,
2505          * causing a call to our MMU notifier.
2506          *
2507          * Certain IO or PFNMAP mappings can be backed with valid
2508          * struct pages, but be allocated without refcounting e.g.,
2509          * tail pages of non-compound higher order allocations, which
2510          * would then underflow the refcount when the caller does the
2511          * required put_page. Don't allow those pages here.
2512          */
2513         if (!kvm_try_get_pfn(pfn))
2514                 r = -EFAULT;
2515
2516 out:
2517         pte_unmap_unlock(ptep, ptl);
2518         *p_pfn = pfn;
2519
2520         return r;
2521 }
2522
2523 /*
2524  * Pin guest page in memory and return its pfn.
2525  * @addr: host virtual address which maps memory to the guest
2526  * @atomic: whether the function is called in atomic context (must not sleep)
2527  * @async: whether this function needs to wait for I/O to complete if the
2528  *         host page is not in memory
2529  * @write_fault: whether we should get a writable host page
2530  * @writable: whether it is allowed to map a writable host page for !@write_fault
2531  *
2532  * The function will map a writable host page for these two cases:
2533  * 1): @write_fault = true
2534  * 2): @write_fault = false && @writable, @writable will tell the caller
2535  *     whether the mapping is writable.
2536  */
2537 kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2538                      bool write_fault, bool *writable)
2539 {
2540         struct vm_area_struct *vma;
2541         kvm_pfn_t pfn = 0;
2542         int npages, r;
2543
2544         /* we can do it either atomically or asynchronously, not both */
2545         BUG_ON(atomic && async);
2546
2547         if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2548                 return pfn;
2549
2550         if (atomic)
2551                 return KVM_PFN_ERR_FAULT;
2552
2553         npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2554         if (npages == 1)
2555                 return pfn;
2556
2557         mmap_read_lock(current->mm);
2558         if (npages == -EHWPOISON ||
2559               (!async && check_user_page_hwpoison(addr))) {
2560                 pfn = KVM_PFN_ERR_HWPOISON;
2561                 goto exit;
2562         }
2563
2564 retry:
2565         vma = vma_lookup(current->mm, addr);
2566
2567         if (vma == NULL)
2568                 pfn = KVM_PFN_ERR_FAULT;
2569         else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2570                 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
2571                 if (r == -EAGAIN)
2572                         goto retry;
2573                 if (r < 0)
2574                         pfn = KVM_PFN_ERR_FAULT;
2575         } else {
2576                 if (async && vma_is_valid(vma, write_fault))
2577                         *async = true;
2578                 pfn = KVM_PFN_ERR_FAULT;
2579         }
2580 exit:
2581         mmap_read_unlock(current->mm);
2582         return pfn;
2583 }
2584
2585 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2586                                bool atomic, bool *async, bool write_fault,
2587                                bool *writable, hva_t *hva)
2588 {
2589         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2590
2591         if (hva)
2592                 *hva = addr;
2593
2594         if (addr == KVM_HVA_ERR_RO_BAD) {
2595                 if (writable)
2596                         *writable = false;
2597                 return KVM_PFN_ERR_RO_FAULT;
2598         }
2599
2600         if (kvm_is_error_hva(addr)) {
2601                 if (writable)
2602                         *writable = false;
2603                 return KVM_PFN_NOSLOT;
2604         }
2605
2606         /* Do not map writable pfn in the readonly memslot. */
2607         if (writable && memslot_is_readonly(slot)) {
2608                 *writable = false;
2609                 writable = NULL;
2610         }
2611
2612         return hva_to_pfn(addr, atomic, async, write_fault,
2613                           writable);
2614 }
2615 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2616
2617 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2618                       bool *writable)
2619 {
2620         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2621                                     write_fault, writable, NULL);
2622 }
2623 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2624
2625 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2626 {
2627         return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2628 }
2629 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2630
2631 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2632 {
2633         return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2634 }
2635 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2636
2637 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2638 {
2639         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2640 }
2641 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2642
2643 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2644 {
2645         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2646 }
2647 EXPORT_SYMBOL_GPL(gfn_to_pfn);
2648
2649 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2650 {
2651         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2652 }
2653 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2654
2655 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2656                             struct page **pages, int nr_pages)
2657 {
2658         unsigned long addr;
2659         gfn_t entry = 0;
2660
2661         addr = gfn_to_hva_many(slot, gfn, &entry);
2662         if (kvm_is_error_hva(addr))
2663                 return -1;
2664
2665         if (entry < nr_pages)
2666                 return 0;
2667
2668         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2669 }
2670 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2671
2672 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2673 {
2674         if (is_error_noslot_pfn(pfn))
2675                 return KVM_ERR_PTR_BAD_PAGE;
2676
2677         if (kvm_is_reserved_pfn(pfn)) {
2678                 WARN_ON(1);
2679                 return KVM_ERR_PTR_BAD_PAGE;
2680         }
2681
2682         return pfn_to_page(pfn);
2683 }
2684
2685 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2686 {
2687         kvm_pfn_t pfn;
2688
2689         pfn = gfn_to_pfn(kvm, gfn);
2690
2691         return kvm_pfn_to_page(pfn);
2692 }
2693 EXPORT_SYMBOL_GPL(gfn_to_page);
2694
2695 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2696 {
2697         if (pfn == 0)
2698                 return;
2699
2700         if (dirty)
2701                 kvm_release_pfn_dirty(pfn);
2702         else
2703                 kvm_release_pfn_clean(pfn);
2704 }
2705
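/*
 * Map a guest page into the kernel's address space: kmap() is used for pfns
 * with a valid struct page, while (if CONFIG_HAS_IOMEM) memremap() handles
 * pfns without one, e.g. MMIO ranges.  Pair with kvm_vcpu_unmap().
 */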
2706 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2707 {
2708         kvm_pfn_t pfn;
2709         void *hva = NULL;
2710         struct page *page = KVM_UNMAPPED_PAGE;
2711
2712         if (!map)
2713                 return -EINVAL;
2714
2715         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2716         if (is_error_noslot_pfn(pfn))
2717                 return -EINVAL;
2718
2719         if (pfn_valid(pfn)) {
2720                 page = pfn_to_page(pfn);
2721                 hva = kmap(page);
2722 #ifdef CONFIG_HAS_IOMEM
2723         } else {
2724                 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2725 #endif
2726         }
2727
2728         if (!hva)
2729                 return -EFAULT;
2730
2731         map->page = page;
2732         map->hva = hva;
2733         map->pfn = pfn;
2734         map->gfn = gfn;
2735
2736         return 0;
2737 }
2738 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2739
2740 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2741 {
2742         if (!map)
2743                 return;
2744
2745         if (!map->hva)
2746                 return;
2747
2748         if (map->page != KVM_UNMAPPED_PAGE)
2749                 kunmap(map->page);
2750 #ifdef CONFIG_HAS_IOMEM
2751         else
2752                 memunmap(map->hva);
2753 #endif
2754
2755         if (dirty)
2756                 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2757
2758         kvm_release_pfn(map->pfn, dirty);
2759
2760         map->hva = NULL;
2761         map->page = NULL;
2762 }
2763 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2764
2765 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2766 {
2767         kvm_pfn_t pfn;
2768
2769         pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2770
2771         return kvm_pfn_to_page(pfn);
2772 }
2773 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2774
2775 void kvm_release_page_clean(struct page *page)
2776 {
2777         WARN_ON(is_error_page(page));
2778
2779         kvm_release_pfn_clean(page_to_pfn(page));
2780 }
2781 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2782
2783 void kvm_release_pfn_clean(kvm_pfn_t pfn)
2784 {
2785         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2786                 put_page(pfn_to_page(pfn));
2787 }
2788 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2789
2790 void kvm_release_page_dirty(struct page *page)
2791 {
2792         WARN_ON(is_error_page(page));
2793
2794         kvm_release_pfn_dirty(page_to_pfn(page));
2795 }
2796 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2797
2798 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2799 {
2800         kvm_set_pfn_dirty(pfn);
2801         kvm_release_pfn_clean(pfn);
2802 }
2803 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2804
2805 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2806 {
2807         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2808                 SetPageDirty(pfn_to_page(pfn));
2809 }
2810 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2811
2812 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2813 {
2814         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2815                 mark_page_accessed(pfn_to_page(pfn));
2816 }
2817 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2818
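/* Number of bytes that can be copied within the current page, at most @len. */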
2819 static int next_segment(unsigned long len, int offset)
2820 {
2821         if (len > PAGE_SIZE - offset)
2822                 return PAGE_SIZE - offset;
2823         else
2824                 return len;
2825 }
2826
2827 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2828                                  void *data, int offset, int len)
2829 {
2830         int r;
2831         unsigned long addr;
2832
2833         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2834         if (kvm_is_error_hva(addr))
2835                 return -EFAULT;
2836         r = __copy_from_user(data, (void __user *)addr + offset, len);
2837         if (r)
2838                 return -EFAULT;
2839         return 0;
2840 }
2841
2842 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2843                         int len)
2844 {
2845         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2846
2847         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2848 }
2849 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2850
2851 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2852                              int offset, int len)
2853 {
2854         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2855
2856         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2857 }
2858 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2859
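/*
 * Copy @len bytes of guest memory starting at @gpa into @data, splitting the
 * copy at page boundaries since consecutive gfns may map to unrelated hvas.
 */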
2860 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2861 {
2862         gfn_t gfn = gpa >> PAGE_SHIFT;
2863         int seg;
2864         int offset = offset_in_page(gpa);
2865         int ret;
2866
2867         while ((seg = next_segment(len, offset)) != 0) {
2868                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2869                 if (ret < 0)
2870                         return ret;
2871                 offset = 0;
2872                 len -= seg;
2873                 data += seg;
2874                 ++gfn;
2875         }
2876         return 0;
2877 }
2878 EXPORT_SYMBOL_GPL(kvm_read_guest);
2879
2880 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2881 {
2882         gfn_t gfn = gpa >> PAGE_SHIFT;
2883         int seg;
2884         int offset = offset_in_page(gpa);
2885         int ret;
2886
2887         while ((seg = next_segment(len, offset)) != 0) {
2888                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2889                 if (ret < 0)
2890                         return ret;
2891                 offset = 0;
2892                 len -= seg;
2893                 data += seg;
2894                 ++gfn;
2895         }
2896         return 0;
2897 }
2898 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2899
2900 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2901                                    void *data, int offset, unsigned long len)
2902 {
2903         int r;
2904         unsigned long addr;
2905
2906         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2907         if (kvm_is_error_hva(addr))
2908                 return -EFAULT;
2909         pagefault_disable();
2910         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2911         pagefault_enable();
2912         if (r)
2913                 return -EFAULT;
2914         return 0;
2915 }
2916
2917 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2918                                void *data, unsigned long len)
2919 {
2920         gfn_t gfn = gpa >> PAGE_SHIFT;
2921         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2922         int offset = offset_in_page(gpa);
2923
2924         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2925 }
2926 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2927
2928 static int __kvm_write_guest_page(struct kvm *kvm,
2929                                   struct kvm_memory_slot *memslot, gfn_t gfn,
2930                                   const void *data, int offset, int len)
2931 {
2932         int r;
2933         unsigned long addr;
2934
2935         addr = gfn_to_hva_memslot(memslot, gfn);
2936         if (kvm_is_error_hva(addr))
2937                 return -EFAULT;
2938         r = __copy_to_user((void __user *)addr + offset, data, len);
2939         if (r)
2940                 return -EFAULT;
2941         mark_page_dirty_in_slot(kvm, memslot, gfn);
2942         return 0;
2943 }
2944
2945 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2946                          const void *data, int offset, int len)
2947 {
2948         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2949
2950         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2951 }
2952 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2953
2954 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2955                               const void *data, int offset, int len)
2956 {
2957         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2958
2959         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2960 }
2961 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2962
2963 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2964                     unsigned long len)
2965 {
2966         gfn_t gfn = gpa >> PAGE_SHIFT;
2967         int seg;
2968         int offset = offset_in_page(gpa);
2969         int ret;
2970
2971         while ((seg = next_segment(len, offset)) != 0) {
2972                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2973                 if (ret < 0)
2974                         return ret;
2975                 offset = 0;
2976                 len -= seg;
2977                 data += seg;
2978                 ++gfn;
2979         }
2980         return 0;
2981 }
2982 EXPORT_SYMBOL_GPL(kvm_write_guest);
2983
2984 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2985                          unsigned long len)
2986 {
2987         gfn_t gfn = gpa >> PAGE_SHIFT;
2988         int seg;
2989         int offset = offset_in_page(gpa);
2990         int ret;
2991
2992         while ((seg = next_segment(len, offset)) != 0) {
2993                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2994                 if (ret < 0)
2995                         return ret;
2996                 offset = 0;
2997                 len -= seg;
2998                 data += seg;
2999                 ++gfn;
3000         }
3001         return 0;
3002 }
3003 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3004
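/*
 * Initialize a gfn_to_hva_cache: pre-translate [gpa, gpa + len) and remember
 * the memslots generation so later cached reads/writes can detect stale
 * translations.  Ranges that span more than one guest page leave ghc->memslot
 * NULL, forcing the cached accessors onto the uncached slow path.
 *
 * Illustrative caller pattern (a sketch, not lifted from any existing user):
 *
 *	struct gfn_to_hva_cache ghc;
 *	u64 val = 0;
 *
 *	if (!kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 */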
3005 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3006                                        struct gfn_to_hva_cache *ghc,
3007                                        gpa_t gpa, unsigned long len)
3008 {
3009         int offset = offset_in_page(gpa);
3010         gfn_t start_gfn = gpa >> PAGE_SHIFT;
3011         gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3012         gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3013         gfn_t nr_pages_avail;
3014
3015         /* Update ghc->generation before performing any error checks. */
3016         ghc->generation = slots->generation;
3017
3018         if (start_gfn > end_gfn) {
3019                 ghc->hva = KVM_HVA_ERR_BAD;
3020                 return -EINVAL;
3021         }
3022
3023         /*
3024          * If the requested region crosses two or more memslots, we still
3025          * verify that the entire region is valid here.
3026          */
3027         for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3028                 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3029                 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3030                                            &nr_pages_avail);
3031                 if (kvm_is_error_hva(ghc->hva))
3032                         return -EFAULT;
3033         }
3034
3035         /* Use the slow path for cross page reads and writes. */
3036         if (nr_pages_needed == 1)
3037                 ghc->hva += offset;
3038         else
3039                 ghc->memslot = NULL;
3040
3041         ghc->gpa = gpa;
3042         ghc->len = len;
3043         return 0;
3044 }
3045
3046 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3047                               gpa_t gpa, unsigned long len)
3048 {
3049         struct kvm_memslots *slots = kvm_memslots(kvm);
3050         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3051 }
3052 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3053
3054 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3055                                   void *data, unsigned int offset,
3056                                   unsigned long len)
3057 {
3058         struct kvm_memslots *slots = kvm_memslots(kvm);
3059         int r;
3060         gpa_t gpa = ghc->gpa + offset;
3061
3062         if (WARN_ON_ONCE(len + offset > ghc->len))
3063                 return -EINVAL;
3064
3065         if (slots->generation != ghc->generation) {
3066                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3067                         return -EFAULT;
3068         }
3069
3070         if (kvm_is_error_hva(ghc->hva))
3071                 return -EFAULT;
3072
3073         if (unlikely(!ghc->memslot))
3074                 return kvm_write_guest(kvm, gpa, data, len);
3075
3076         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3077         if (r)
3078                 return -EFAULT;
3079         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3080
3081         return 0;
3082 }
3083 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3084
3085 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3086                            void *data, unsigned long len)
3087 {
3088         return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3089 }
3090 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3091
3092 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3093                                  void *data, unsigned int offset,
3094                                  unsigned long len)
3095 {
3096         struct kvm_memslots *slots = kvm_memslots(kvm);
3097         int r;
3098         gpa_t gpa = ghc->gpa + offset;
3099
3100         if (WARN_ON_ONCE(len + offset > ghc->len))
3101                 return -EINVAL;
3102
3103         if (slots->generation != ghc->generation) {
3104                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3105                         return -EFAULT;
3106         }
3107
3108         if (kvm_is_error_hva(ghc->hva))
3109                 return -EFAULT;
3110
3111         if (unlikely(!ghc->memslot))
3112                 return kvm_read_guest(kvm, gpa, data, len);
3113
3114         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3115         if (r)
3116                 return -EFAULT;
3117
3118         return 0;
3119 }
3120 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3121
3122 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3123                           void *data, unsigned long len)
3124 {
3125         return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3126 }
3127 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
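
/*
 * Illustrative sketch (not part of this file): a typical caller initializes
 * the cache once and then uses the cached helpers for repeated accesses to
 * the same guest area, e.g.:
 *
 *	struct gfn_to_hva_cache ghc;
 *	u64 val = 0;
 *
 *	if (!kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 *
 * The cache is revalidated automatically whenever the memslot generation
 * changes.
 */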
3128
3129 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3130 {
3131         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3132         gfn_t gfn = gpa >> PAGE_SHIFT;
3133         int seg;
3134         int offset = offset_in_page(gpa);
3135         int ret;
3136
3137         while ((seg = next_segment(len, offset)) != 0) {
3138                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3139                 if (ret < 0)
3140                         return ret;
3141                 offset = 0;
3142                 len -= seg;
3143                 ++gfn;
3144         }
3145         return 0;
3146 }
3147 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3148
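/*
 * Record that @gfn in @memslot has been dirtied.  If dirty logging is
 * enabled for the slot, either push an entry onto the running vCPU's dirty
 * ring (when dirty rings are in use) or set the corresponding bit in the
 * memslot's dirty bitmap.
 */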
3149 void mark_page_dirty_in_slot(struct kvm *kvm,
3150                              const struct kvm_memory_slot *memslot,
3151                              gfn_t gfn)
3152 {
3153         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3154
3155 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3156         if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
3157                 return;
3158 #endif
3159
3160         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3161                 unsigned long rel_gfn = gfn - memslot->base_gfn;
3162                 u32 slot = (memslot->as_id << 16) | memslot->id;
3163
3164                 if (kvm->dirty_ring_size)
3165                         kvm_dirty_ring_push(&vcpu->dirty_ring,
3166                                             slot, rel_gfn);
3167                 else
3168                         set_bit_le(rel_gfn, memslot->dirty_bitmap);
3169         }
3170 }
3171 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3172
3173 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3174 {
3175         struct kvm_memory_slot *memslot;
3176
3177         memslot = gfn_to_memslot(kvm, gfn);
3178         mark_page_dirty_in_slot(kvm, memslot, gfn);
3179 }
3180 EXPORT_SYMBOL_GPL(mark_page_dirty);
3181
3182 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3183 {
3184         struct kvm_memory_slot *memslot;
3185
3186         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3187         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3188 }
3189 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3190
3191 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3192 {
3193         if (!vcpu->sigset_active)
3194                 return;
3195
3196         /*
3197          * This does a lockless modification of ->real_blocked, which is fine
3198          * because only current can change ->real_blocked, and all readers of
3199          * ->real_blocked don't care as long as ->real_blocked is always a subset
3200          * of ->blocked.
3201          */
3202         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3203 }
3204
3205 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3206 {
3207         if (!vcpu->sigset_active)
3208                 return;
3209
3210         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3211         sigemptyset(&current->real_blocked);
3212 }
3213
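/*
 * Grow the vCPU's halt-polling window: multiply it by halt_poll_ns_grow
 * (a no-op if the module parameter is zero), raise it to at least
 * halt_poll_ns_grow_start, and clamp it to the VM's max_halt_poll_ns.
 */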
3214 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3215 {
3216         unsigned int old, val, grow, grow_start;
3217
3218         old = val = vcpu->halt_poll_ns;
3219         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3220         grow = READ_ONCE(halt_poll_ns_grow);
3221         if (!grow)
3222                 goto out;
3223
3224         val *= grow;
3225         if (val < grow_start)
3226                 val = grow_start;
3227
3228         if (val > vcpu->kvm->max_halt_poll_ns)
3229                 val = vcpu->kvm->max_halt_poll_ns;
3230
3231         vcpu->halt_poll_ns = val;
3232 out:
3233         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3234 }
3235
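/*
 * Shrink the vCPU's halt-polling window: divide it by halt_poll_ns_shrink,
 * disabling polling entirely if shrinking is disabled or the result would
 * fall below halt_poll_ns_grow_start.
 */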
3236 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3237 {
3238         unsigned int old, val, shrink, grow_start;
3239
3240         old = val = vcpu->halt_poll_ns;
3241         shrink = READ_ONCE(halt_poll_ns_shrink);
3242         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3243         if (shrink == 0)
3244                 val = 0;
3245         else
3246                 val /= shrink;
3247
3248         if (val < grow_start)
3249                 val = 0;
3250
3251         vcpu->halt_poll_ns = val;
3252         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3253 }
3254
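/*
 * Returns 0 if the vCPU should keep blocking, -EINTR if it must stop:
 * the vCPU became runnable (KVM_REQ_UNHALT is set in that case), a timer
 * or signal is pending, or KVM_REQ_UNBLOCK was requested.
 */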
3255 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3256 {
3257         int ret = -EINTR;
3258         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3259
3260         if (kvm_arch_vcpu_runnable(vcpu)) {
3261                 kvm_make_request(KVM_REQ_UNHALT, vcpu);
3262                 goto out;
3263         }
3264         if (kvm_cpu_has_pending_timer(vcpu))
3265                 goto out;
3266         if (signal_pending(current))
3267                 goto out;
3268         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3269                 goto out;
3270
3271         ret = 0;
3272 out:
3273         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3274         return ret;
3275 }
3276
3277 /*
3278  * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3279  * pending.  This is mostly used when halting a vCPU, but may also be used
3280  * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3281  */
3282 bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3283 {
3284         struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3285         bool waited = false;
3286
3287         vcpu->stat.generic.blocking = 1;
3288
3289         kvm_arch_vcpu_blocking(vcpu);
3290
3291         prepare_to_rcuwait(wait);
3292         for (;;) {
3293                 set_current_state(TASK_INTERRUPTIBLE);
3294
3295                 if (kvm_vcpu_check_block(vcpu) < 0)
3296                         break;
3297
3298                 waited = true;
3299                 schedule();
3300         }
3301         finish_rcuwait(wait);
3302
3303         kvm_arch_vcpu_unblocking(vcpu);
3304
3305         vcpu->stat.generic.blocking = 0;
3306
3307         return waited;
3308 }
3309
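/*
 * Account one halt-polling attempt: credit the elapsed poll time to either
 * the success or the failure bucket and update the matching logarithmic
 * histogram.
 */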
3310 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3311                                           ktime_t end, bool success)
3312 {
3313         struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3314         u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3315
3316         ++vcpu->stat.generic.halt_attempted_poll;
3317
3318         if (success) {
3319                 ++vcpu->stat.generic.halt_successful_poll;
3320
3321                 if (!vcpu_valid_wakeup(vcpu))
3322                         ++vcpu->stat.generic.halt_poll_invalid;
3323
3324                 stats->halt_poll_success_ns += poll_ns;
3325                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3326         } else {
3327                 stats->halt_poll_fail_ns += poll_ns;
3328                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3329         }
3330 }
3331
3332 /*
3333  * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc.  If halt
3334  * polling is enabled, busy wait for a short time before blocking to avoid the
3335  * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3336  * is halted.
3337  */
3338 void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3339 {
3340         bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3341         bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3342         ktime_t start, cur, poll_end;
3343         bool waited = false;
3344         u64 halt_ns;
3345
3346         start = cur = poll_end = ktime_get();
3347         if (do_halt_poll) {
3348                 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3349
3350                 do {
3351                         /*
3352                          * This sets KVM_REQ_UNHALT if an interrupt
3353                          * arrives.
3354                          */
3355                         if (kvm_vcpu_check_block(vcpu) < 0)
3356                                 goto out;
3357                         cpu_relax();
3358                         poll_end = cur = ktime_get();
3359                 } while (kvm_vcpu_can_poll(cur, stop));
3360         }
3361
3362         waited = kvm_vcpu_block(vcpu);
3363
3364         cur = ktime_get();
3365         if (waited) {
3366                 vcpu->stat.generic.halt_wait_ns +=
3367                         ktime_to_ns(cur) - ktime_to_ns(poll_end);
3368                 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3369                                 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3370         }
3371 out:
3372         /* The total time the vCPU was "halted", including polling time. */
3373         halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3374
3375         /*
3376          * Note, halt-polling is considered successful so long as the vCPU was
3377          * never actually scheduled out, i.e. even if the wake event arrived
3378          * after the halt-polling loop itself finished, but before the full wait.
3379          */
3380         if (do_halt_poll)
3381                 update_halt_poll_stats(vcpu, start, poll_end, !waited);
3382
3383         if (halt_poll_allowed) {
3384                 if (!vcpu_valid_wakeup(vcpu)) {
3385                         shrink_halt_poll_ns(vcpu);
3386                 } else if (vcpu->kvm->max_halt_poll_ns) {
3387                         if (halt_ns <= vcpu->halt_poll_ns)
3388                                 ;
3389                         /* we had a long block, shrink polling */
3390                         else if (vcpu->halt_poll_ns &&
3391                                  halt_ns > vcpu->kvm->max_halt_poll_ns)
3392                                 shrink_halt_poll_ns(vcpu);
3393                         /* we had a short halt and our poll time is too small */
3394                         else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3395                                  halt_ns < vcpu->kvm->max_halt_poll_ns)
3396                                 grow_halt_poll_ns(vcpu);
3397                 } else {
3398                         vcpu->halt_poll_ns = 0;
3399                 }
3400         }
3401
3402         trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3403 }
3404 EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3405
3406 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3407 {
3408         if (__kvm_vcpu_wake_up(vcpu)) {
3409                 WRITE_ONCE(vcpu->ready, true);
3410                 ++vcpu->stat.generic.halt_wakeup;
3411                 return true;
3412         }
3413
3414         return false;
3415 }
3416 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3417
3418 #ifndef CONFIG_S390
3419 /*
3420  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3421  */
3422 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3423 {
3424         int me, cpu;
3425
3426         if (kvm_vcpu_wake_up(vcpu))
3427                 return;
3428
3429         me = get_cpu();
3430         /*
3431          * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3432          * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
3433          * kick" check does not need atomic operations if kvm_vcpu_kick is used
3434          * within the vCPU thread itself.
3435          */
3436         if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3437                 if (vcpu->mode == IN_GUEST_MODE)
3438                         WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3439                 goto out;
3440         }
3441
3442         /*
3443          * Note, the vCPU could get migrated to a different pCPU at any point
3444          * after kvm_arch_vcpu_should_kick(), which could result in sending an
3445          * IPI to the previous pCPU.  But, that's ok because the purpose of the
3446          * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3447          * vCPU also requires it to leave IN_GUEST_MODE.
3448          */
3449         if (kvm_arch_vcpu_should_kick(vcpu)) {
3450                 cpu = READ_ONCE(vcpu->cpu);
3451                 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3452                         smp_send_reschedule(cpu);
3453         }
3454 out:
3455         put_cpu();
3456 }
3457 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3458 #endif /* !CONFIG_S390 */
3459
3460 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3461 {
3462         struct pid *pid;
3463         struct task_struct *task = NULL;
3464         int ret = 0;
3465
3466         rcu_read_lock();
3467         pid = rcu_dereference(target->pid);
3468         if (pid)
3469                 task = get_pid_task(pid, PIDTYPE_PID);
3470         rcu_read_unlock();
3471         if (!task)
3472                 return ret;
3473         ret = yield_to(task, 1);
3474         put_task_struct(task);
3475
3476         return ret;
3477 }
3478 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3479
3480 /*
3481  * Helper that checks whether a VCPU is eligible for directed yield.
3482  * The most eligible candidate to yield to is decided by the following
3483  * heuristics:
3484  *
3485  *  (a) A VCPU which has not recently done a PLE exit or had a cpu-relax
3486  *  intercept (a preempted lock holder), indicated by @in_spin_loop.
3487  *  Set at the beginning and cleared at the end of the interception/PLE handler.
3488  *
3489  *  (b) A VCPU which has done a PLE exit / cpu-relax intercept but did not get
3490  *  a chance last time (it has most likely become eligible now, since we
3491  *  probably yielded to the lock holder in the last iteration).  This is done
3492  *  by toggling @dy_eligible each time a VCPU is checked for eligibility.
3493  *
3494  *  Yielding to a recently PLE-exited/cpu-relax-intercepted VCPU before
3495  *  yielding to a preempted lock holder could result in the wrong VCPU being
3496  *  selected and CPU time being burned.  Giving priority to a potential lock
3497  *  holder increases lock progress.
3498  *
3499  *  Since the algorithm is based on heuristics, accessing another VCPU's data
3500  *  without locking does no harm.  It may result in trying to yield to the
3501  */
3502 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3503 {
3504 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3505         bool eligible;
3506
3507         eligible = !vcpu->spin_loop.in_spin_loop ||
3508                     vcpu->spin_loop.dy_eligible;
3509
3510         if (vcpu->spin_loop.in_spin_loop)
3511                 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3512
3513         return eligible;
3514 #else
3515         return true;
3516 #endif
3517 }
3518
3519 /*
3520  * Unlike kvm_arch_vcpu_runnable, this function is called outside
3521  * a vcpu_load/vcpu_put pair.  However, for most architectures
3522  * kvm_arch_vcpu_runnable does not require vcpu_load.
3523  */
3524 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3525 {
3526         return kvm_arch_vcpu_runnable(vcpu);
3527 }
3528
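/*
 * A vCPU is a valid directed-yield target if the architecture considers it
 * runnable or if it has completed async page faults waiting to be delivered.
 */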
3529 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3530 {
3531         if (kvm_arch_dy_runnable(vcpu))
3532                 return true;
3533
3534 #ifdef CONFIG_KVM_ASYNC_PF
3535         if (!list_empty_careful(&vcpu->async_pf.done))
3536                 return true;
3537 #endif
3538
3539         return false;
3540 }
3541
3542 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3543 {
3544         return false;
3545 }
3546
3547 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3548 {
3549         struct kvm *kvm = me->kvm;
3550         struct kvm_vcpu *vcpu;
3551         int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3552         unsigned long i;
3553         int yielded = 0;
3554         int try = 3;
3555         int pass;
3556
3557         kvm_vcpu_set_in_spin_loop(me, true);
3558         /*
3559          * We boost the priority of a VCPU that is runnable but not
3560          * currently running, because it got preempted by something
3561          * else and called schedule in __vcpu_run.  Hopefully that
3562          * VCPU is holding the lock that we need and will release it.
3563          * We approximate round-robin by starting at the last boosted VCPU.
3564          */
3565         for (pass = 0; pass < 2 && !yielded && try; pass++) {
3566                 kvm_for_each_vcpu(i, vcpu, kvm) {
3567                         if (!pass && i <= last_boosted_vcpu) {
3568                                 i = last_boosted_vcpu;
3569                                 continue;
3570                         } else if (pass && i > last_boosted_vcpu)
3571                                 break;
3572                         if (!READ_ONCE(vcpu->ready))
3573                                 continue;
3574                         if (vcpu == me)
3575                                 continue;
3576                         if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3577                                 continue;
3578                         if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3579                             !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3580                             !kvm_arch_vcpu_in_kernel(vcpu))
3581                                 continue;
3582                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3583                                 continue;
3584
3585                         yielded = kvm_vcpu_yield_to(vcpu);
3586                         if (yielded > 0) {
3587                                 kvm->last_boosted_vcpu = i;
3588                                 break;
3589                         } else if (yielded < 0) {
3590                                 try--;
3591                                 if (!try)
3592                                         break;
3593                         }
3594                 }
3595         }
3596         kvm_vcpu_set_in_spin_loop(me, false);
3597
3598         /* Ensure vcpu is not eligible during next spinloop */
3599         kvm_vcpu_set_dy_eligible(me, false);
3600 }
3601 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3602
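/*
 * Check whether a vCPU mmap page offset falls within the dirty ring region
 * exposed at KVM_DIRTY_LOG_PAGE_OFFSET.
 */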
3603 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3604 {
3605 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3606         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3607             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3608              kvm->dirty_ring_size / PAGE_SIZE);
3609 #else
3610         return false;
3611 #endif
3612 }
3613
3614 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3615 {
3616         struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3617         struct page *page;
3618
3619         if (vmf->pgoff == 0)
3620                 page = virt_to_page(vcpu->run);
3621 #ifdef CONFIG_X86
3622         else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3623                 page = virt_to_page(vcpu->arch.pio_data);
3624 #endif
3625 #ifdef CONFIG_KVM_MMIO
3626         else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3627                 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3628 #endif
3629         else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3630                 page = kvm_dirty_ring_get_page(
3631                     &vcpu->dirty_ring,
3632                     vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3633         else
3634                 return kvm_arch_vcpu_fault(vcpu, vmf);
3635         get_page(page);
3636         vmf->page = page;
3637         return 0;
3638 }
3639
3640 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3641         .fault = kvm_vcpu_fault,
3642 };
3643
3644 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3645 {
3646         struct kvm_vcpu *vcpu = file->private_data;
3647         unsigned long pages = vma_pages(vma);
3648
3649         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3650              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3651             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3652                 return -EINVAL;
3653
3654         vma->vm_ops = &kvm_vcpu_vm_ops;
3655         return 0;
3656 }
3657
3658 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3659 {
3660         struct kvm_vcpu *vcpu = filp->private_data;
3661
3662         kvm_put_kvm(vcpu->kvm);
3663         return 0;
3664 }
3665
3666 static struct file_operations kvm_vcpu_fops = {
3667         .release        = kvm_vcpu_release,
3668         .unlocked_ioctl = kvm_vcpu_ioctl,
3669         .mmap           = kvm_vcpu_mmap,
3670         .llseek         = noop_llseek,
3671         KVM_COMPAT(kvm_vcpu_compat_ioctl),
3672 };
3673
3674 /*
3675  * Allocates an anon inode backed file descriptor for the vcpu.
3676  */
3677 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3678 {
3679         char name[8 + 1 + ITOA_MAX_LEN + 1];
3680
3681         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3682         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3683 }
3684
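/*
 * Create the per-vCPU debugfs directory ("vcpu<id>") under the VM's debugfs
 * entry and let the architecture populate it, on architectures that provide
 * per-vCPU debugfs support.
 */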
3685 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3686 {
3687 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3688         struct dentry *debugfs_dentry;
3689         char dir_name[ITOA_MAX_LEN * 2];
3690
3691         if (!debugfs_initialized())
3692                 return;
3693
3694         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3695         debugfs_dentry = debugfs_create_dir(dir_name,
3696                                             vcpu->kvm->debugfs_dentry);
3697
3698         kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3699 #endif
3700 }
3701
3702 /*
3703  * Creates a virtual cpu for @kvm with the given @id and returns its fd.
3704  */
3705 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3706 {
3707         int r;
3708         struct kvm_vcpu *vcpu;
3709         struct page *page;
3710
3711         if (id >= KVM_MAX_VCPU_IDS)
3712                 return -EINVAL;
3713
3714         mutex_lock(&kvm->lock);
3715         if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3716                 mutex_unlock(&kvm->lock);
3717                 return -EINVAL;
3718         }
3719
3720         kvm->created_vcpus++;
3721         mutex_unlock(&kvm->lock);
3722
3723         r = kvm_arch_vcpu_precreate(kvm, id);
3724         if (r)
3725                 goto vcpu_decrement;
3726
3727         vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3728         if (!vcpu) {
3729                 r = -ENOMEM;
3730                 goto vcpu_decrement;
3731         }
3732
3733         BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3734         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3735         if (!page) {
3736                 r = -ENOMEM;
3737                 goto vcpu_free;
3738         }
3739         vcpu->run = page_address(page);
3740
3741         kvm_vcpu_init(vcpu, kvm, id);
3742
3743         r = kvm_arch_vcpu_create(vcpu);
3744         if (r)
3745                 goto vcpu_free_run_page;
3746
3747         if (kvm->dirty_ring_size) {
3748                 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3749                                          id, kvm->dirty_ring_size);
3750                 if (r)
3751                         goto arch_vcpu_destroy;
3752         }
3753
3754         mutex_lock(&kvm->lock);
3755         if (kvm_get_vcpu_by_id(kvm, id)) {
3756                 r = -EEXIST;
3757                 goto unlock_vcpu_destroy;
3758         }
3759
3760         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3761         r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
3762         BUG_ON(r == -EBUSY);
3763         if (r)
3764                 goto unlock_vcpu_destroy;
3765
3766         /* Fill the stats id string for the vcpu */
3767         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3768                  task_pid_nr(current), id);
3769
3770         /* Now it's all set up, let userspace reach it */
3771         kvm_get_kvm(kvm);
3772         r = create_vcpu_fd(vcpu);
3773         if (r < 0) {
3774                 xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
3775                 kvm_put_kvm_no_destroy(kvm);
3776                 goto unlock_vcpu_destroy;
3777         }
3778
3779         /*
3780          * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
3781          * pointer before the incremented value of kvm->online_vcpus.
3782          */
3783         smp_wmb();
3784         atomic_inc(&kvm->online_vcpus);
3785
3786         mutex_unlock(&kvm->lock);
3787         kvm_arch_vcpu_postcreate(vcpu);
3788         kvm_create_vcpu_debugfs(vcpu);
3789         return r;
3790
3791 unlock_vcpu_destroy:
3792         mutex_unlock(&kvm->lock);
3793         kvm_dirty_ring_free(&vcpu->dirty_ring);
3794 arch_vcpu_destroy:
3795         kvm_arch_vcpu_destroy(vcpu);
3796 vcpu_free_run_page:
3797         free_page((unsigned long)vcpu->run);
3798 vcpu_free:
3799         kmem_cache_free(kvm_vcpu_cache, vcpu);
3800 vcpu_decrement:
3801         mutex_lock(&kvm->lock);
3802         kvm->created_vcpus--;
3803         mutex_unlock(&kvm->lock);
3804         return r;
3805 }
3806
3807 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3808 {
3809         if (sigset) {
3810                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3811                 vcpu->sigset_active = 1;
3812                 vcpu->sigset = *sigset;
3813         } else
3814                 vcpu->sigset_active = 0;
3815         return 0;
3816 }
3817
3818 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3819                               size_t size, loff_t *offset)
3820 {
3821         struct kvm_vcpu *vcpu = file->private_data;
3822
3823         return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3824                         &kvm_vcpu_stats_desc[0], &vcpu->stat,
3825                         sizeof(vcpu->stat), user_buffer, size, offset);
3826 }
3827
3828 static const struct file_operations kvm_vcpu_stats_fops = {
3829         .read = kvm_vcpu_stats_read,
3830         .llseek = noop_llseek,
3831 };
3832
3833 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3834 {
3835         int fd;
3836         struct file *file;
3837         char name[15 + ITOA_MAX_LEN + 1];
3838
3839         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3840
3841         fd = get_unused_fd_flags(O_CLOEXEC);
3842         if (fd < 0)
3843                 return fd;
3844
3845         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3846         if (IS_ERR(file)) {
3847                 put_unused_fd(fd);
3848                 return PTR_ERR(file);
3849         }
3850         file->f_mode |= FMODE_PREAD;
3851         fd_install(fd, file);
3852
3853         return fd;
3854 }
3855
3856 static long kvm_vcpu_ioctl(struct file *filp,
3857                            unsigned int ioctl, unsigned long arg)
3858 {
3859         struct kvm_vcpu *vcpu = filp->private_data;
3860         void __user *argp = (void __user *)arg;
3861         int r;
3862         struct kvm_fpu *fpu = NULL;
3863         struct kvm_sregs *kvm_sregs = NULL;
3864
3865         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3866                 return -EIO;
3867
3868         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3869                 return -EINVAL;
3870
3871         /*
3872          * Some architectures have vcpu ioctls that are asynchronous to vcpu
3873          * execution; mutex_lock() would break them.
3874          */
3875         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3876         if (r != -ENOIOCTLCMD)
3877                 return r;
3878
3879         if (mutex_lock_killable(&vcpu->mutex))
3880                 return -EINTR;
3881         switch (ioctl) {
3882         case KVM_RUN: {
3883                 struct pid *oldpid;
3884                 r = -EINVAL;
3885                 if (arg)
3886                         goto out;
3887                 oldpid = rcu_access_pointer(vcpu->pid);
3888                 if (unlikely(oldpid != task_pid(current))) {
3889                         /* The thread running this VCPU changed. */
3890                         struct pid *newpid;
3891
3892                         r = kvm_arch_vcpu_run_pid_change(vcpu);
3893                         if (r)
3894                                 break;
3895
3896                         newpid = get_task_pid(current, PIDTYPE_PID);
3897                         rcu_assign_pointer(vcpu->pid, newpid);
3898                         if (oldpid)
3899                                 synchronize_rcu();
3900                         put_pid(oldpid);
3901                 }
3902                 r = kvm_arch_vcpu_ioctl_run(vcpu);
3903                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3904                 break;
3905         }
3906         case KVM_GET_REGS: {
3907                 struct kvm_regs *kvm_regs;
3908
3909                 r = -ENOMEM;
3910                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3911                 if (!kvm_regs)
3912                         goto out;
3913                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3914                 if (r)
3915                         goto out_free1;
3916                 r = -EFAULT;
3917                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3918                         goto out_free1;
3919                 r = 0;
3920 out_free1:
3921                 kfree(kvm_regs);
3922                 break;
3923         }
3924         case KVM_SET_REGS: {
3925                 struct kvm_regs *kvm_regs;
3926
3927                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3928                 if (IS_ERR(kvm_regs)) {
3929                         r = PTR_ERR(kvm_regs);
3930                         goto out;
3931                 }
3932                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3933                 kfree(kvm_regs);
3934                 break;
3935         }
3936         case KVM_GET_SREGS: {
3937                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3938                                     GFP_KERNEL_ACCOUNT);
3939                 r = -ENOMEM;
3940                 if (!kvm_sregs)
3941                         goto out;
3942                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3943                 if (r)
3944                         goto out;
3945                 r = -EFAULT;
3946                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3947                         goto out;
3948                 r = 0;
3949                 break;
3950         }
3951         case KVM_SET_SREGS: {
3952                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3953                 if (IS_ERR(kvm_sregs)) {
3954                         r = PTR_ERR(kvm_sregs);
3955                         kvm_sregs = NULL;
3956                         goto out;
3957                 }
3958                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3959                 break;
3960         }
3961         case KVM_GET_MP_STATE: {
3962                 struct kvm_mp_state mp_state;
3963
3964                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3965                 if (r)
3966                         goto out;
3967                 r = -EFAULT;
3968                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3969                         goto out;
3970                 r = 0;
3971                 break;
3972         }
3973         case KVM_SET_MP_STATE: {
3974                 struct kvm_mp_state mp_state;
3975
3976                 r = -EFAULT;
3977                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3978                         goto out;
3979                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3980                 break;
3981         }
3982         case KVM_TRANSLATE: {
3983                 struct kvm_translation tr;
3984
3985                 r = -EFAULT;
3986                 if (copy_from_user(&tr, argp, sizeof(tr)))
3987                         goto out;
3988                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3989                 if (r)
3990                         goto out;
3991                 r = -EFAULT;
3992                 if (copy_to_user(argp, &tr, sizeof(tr)))
3993                         goto out;
3994                 r = 0;
3995                 break;
3996         }
3997         case KVM_SET_GUEST_DEBUG: {
3998                 struct kvm_guest_debug dbg;
3999
4000                 r = -EFAULT;
4001                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4002                         goto out;
4003                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4004                 break;
4005         }
4006         case KVM_SET_SIGNAL_MASK: {
4007                 struct kvm_signal_mask __user *sigmask_arg = argp;
4008                 struct kvm_signal_mask kvm_sigmask;
4009                 sigset_t sigset, *p;
4010
4011                 p = NULL;
4012                 if (argp) {
4013                         r = -EFAULT;
4014                         if (copy_from_user(&kvm_sigmask, argp,
4015                                            sizeof(kvm_sigmask)))
4016                                 goto out;
4017                         r = -EINVAL;
4018                         if (kvm_sigmask.len != sizeof(sigset))
4019                                 goto out;
4020                         r = -EFAULT;
4021                         if (copy_from_user(&sigset, sigmask_arg->sigset,
4022                                            sizeof(sigset)))
4023                                 goto out;
4024                         p = &sigset;
4025                 }
4026                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4027                 break;
4028         }
4029         case KVM_GET_FPU: {
4030                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4031                 r = -ENOMEM;
4032                 if (!fpu)
4033                         goto out;
4034                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4035                 if (r)
4036                         goto out;
4037                 r = -EFAULT;
4038                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4039                         goto out;
4040                 r = 0;
4041                 break;
4042         }
4043         case KVM_SET_FPU: {
4044                 fpu = memdup_user(argp, sizeof(*fpu));
4045                 if (IS_ERR(fpu)) {
4046                         r = PTR_ERR(fpu);
4047                         fpu = NULL;
4048                         goto out;
4049                 }
4050                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4051                 break;
4052         }
4053         case KVM_GET_STATS_FD: {
4054                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4055                 break;
4056         }
4057         default:
4058                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4059         }
4060 out:
4061         mutex_unlock(&vcpu->mutex);
4062         kfree(fpu);
4063         kfree(kvm_sregs);
4064         return r;
4065 }
4066
4067 #ifdef CONFIG_KVM_COMPAT
4068 static long kvm_vcpu_compat_ioctl(struct file *filp,
4069                                   unsigned int ioctl, unsigned long arg)
4070 {
4071         struct kvm_vcpu *vcpu = filp->private_data;
4072         void __user *argp = compat_ptr(arg);
4073         int r;
4074
4075         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4076                 return -EIO;
4077
4078         switch (ioctl) {
4079         case KVM_SET_SIGNAL_MASK: {
4080                 struct kvm_signal_mask __user *sigmask_arg = argp;
4081                 struct kvm_signal_mask kvm_sigmask;
4082                 sigset_t sigset;
4083
4084                 if (argp) {
4085                         r = -EFAULT;
4086                         if (copy_from_user(&kvm_sigmask, argp,
4087                                            sizeof(kvm_sigmask)))
4088                                 goto out;
4089                         r = -EINVAL;
4090                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
4091                                 goto out;
4092                         r = -EFAULT;
4093                         if (get_compat_sigset(&sigset,
4094                                               (compat_sigset_t __user *)sigmask_arg->sigset))
4095                                 goto out;
4096                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4097                 } else
4098                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4099                 break;
4100         }
4101         default:
4102                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4103         }
4104
4105 out:
4106         return r;
4107 }
4108 #endif
4109
4110 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4111 {
4112         struct kvm_device *dev = filp->private_data;
4113
4114         if (dev->ops->mmap)
4115                 return dev->ops->mmap(dev, vma);
4116
4117         return -ENODEV;
4118 }
4119
4120 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4121                                  int (*accessor)(struct kvm_device *dev,
4122                                                  struct kvm_device_attr *attr),
4123                                  unsigned long arg)
4124 {
4125         struct kvm_device_attr attr;
4126
4127         if (!accessor)
4128                 return -EPERM;
4129
4130         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4131                 return -EFAULT;
4132
4133         return accessor(dev, &attr);
4134 }
4135
4136 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4137                              unsigned long arg)
4138 {
4139         struct kvm_device *dev = filp->private_data;
4140
4141         if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4142                 return -EIO;
4143
4144         switch (ioctl) {
4145         case KVM_SET_DEVICE_ATTR:
4146                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4147         case KVM_GET_DEVICE_ATTR:
4148                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4149         case KVM_HAS_DEVICE_ATTR:
4150                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4151         default:
4152                 if (dev->ops->ioctl)
4153                         return dev->ops->ioctl(dev, ioctl, arg);
4154
4155                 return -ENOTTY;
4156         }
4157 }
4158
4159 static int kvm_device_release(struct inode *inode, struct file *filp)
4160 {
4161         struct kvm_device *dev = filp->private_data;
4162         struct kvm *kvm = dev->kvm;
4163
4164         if (dev->ops->release) {
4165                 mutex_lock(&kvm->lock);
4166                 list_del(&dev->vm_node);
4167                 dev->ops->release(dev);
4168                 mutex_unlock(&kvm->lock);
4169         }
4170
4171         kvm_put_kvm(kvm);
4172         return 0;
4173 }
4174
4175 static const struct file_operations kvm_device_fops = {
4176         .unlocked_ioctl = kvm_device_ioctl,
4177         .release = kvm_device_release,
4178         KVM_COMPAT(kvm_device_ioctl),
4179         .mmap = kvm_device_mmap,
4180 };
4181
4182 struct kvm_device *kvm_device_from_filp(struct file *filp)
4183 {
4184         if (filp->f_op != &kvm_device_fops)
4185                 return NULL;
4186
4187         return filp->private_data;
4188 }
4189
4190 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4191 #ifdef CONFIG_KVM_MPIC
4192         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4193         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4194 #endif
4195 };
4196
4197 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4198 {
4199         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4200                 return -ENOSPC;
4201
4202         if (kvm_device_ops_table[type] != NULL)
4203                 return -EEXIST;
4204
4205         kvm_device_ops_table[type] = ops;
4206         return 0;
4207 }
4208
4209 void kvm_unregister_device_ops(u32 type)
4210 {
4211         if (kvm_device_ops_table[type] != NULL)
4212                 kvm_device_ops_table[type] = NULL;
4213 }
4214
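/*
 * Handle KVM_CREATE_DEVICE: look up the requested device type and, unless
 * the caller only asked to test for availability (KVM_CREATE_DEVICE_TEST),
 * create the device, link it into the VM's device list and return a new
 * device fd in @cd->fd.
 */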
4215 static int kvm_ioctl_create_device(struct kvm *kvm,
4216                                    struct kvm_create_device *cd)
4217 {
4218         const struct kvm_device_ops *ops = NULL;
4219         struct kvm_device *dev;
4220         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4221         int type;
4222         int ret;
4223
4224         if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4225                 return -ENODEV;
4226
4227         type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4228         ops = kvm_device_ops_table[type];
4229         if (ops == NULL)
4230                 return -ENODEV;
4231
4232         if (test)
4233                 return 0;
4234
4235         dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4236         if (!dev)
4237                 return -ENOMEM;
4238
4239         dev->ops = ops;
4240         dev->kvm = kvm;
4241
4242         mutex_lock(&kvm->lock);
4243         ret = ops->create(dev, type);
4244         if (ret < 0) {
4245                 mutex_unlock(&kvm->lock);
4246                 kfree(dev);
4247                 return ret;
4248         }
4249         list_add(&dev->vm_node, &kvm->devices);
4250         mutex_unlock(&kvm->lock);
4251
4252         if (ops->init)
4253                 ops->init(dev);
4254
4255         kvm_get_kvm(kvm);
4256         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4257         if (ret < 0) {
4258                 kvm_put_kvm_no_destroy(kvm);
4259                 mutex_lock(&kvm->lock);
4260                 list_del(&dev->vm_node);
4261                 mutex_unlock(&kvm->lock);
4262                 ops->destroy(dev);
4263                 return ret;
4264         }
4265
4266         cd->fd = ret;
4267         return 0;
4268 }
4269
4270 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4271 {
4272         switch (arg) {
4273         case KVM_CAP_USER_MEMORY:
4274         case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4275         case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4276         case KVM_CAP_INTERNAL_ERROR_DATA:
4277 #ifdef CONFIG_HAVE_KVM_MSI
4278         case KVM_CAP_SIGNAL_MSI:
4279 #endif
4280 #ifdef CONFIG_HAVE_KVM_IRQFD
4281         case KVM_CAP_IRQFD:
4282         case KVM_CAP_IRQFD_RESAMPLE:
4283 #endif
4284         case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4285         case KVM_CAP_CHECK_EXTENSION_VM:
4286         case KVM_CAP_ENABLE_CAP_VM:
4287         case KVM_CAP_HALT_POLL:
4288                 return 1;
4289 #ifdef CONFIG_KVM_MMIO
4290         case KVM_CAP_COALESCED_MMIO:
4291                 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4292         case KVM_CAP_COALESCED_PIO:
4293                 return 1;
4294 #endif
4295 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4296         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4297                 return KVM_DIRTY_LOG_MANUAL_CAPS;
4298 #endif
4299 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4300         case KVM_CAP_IRQ_ROUTING:
4301                 return KVM_MAX_IRQ_ROUTES;
4302 #endif
4303 #if KVM_ADDRESS_SPACE_NUM > 1
4304         case KVM_CAP_MULTI_ADDRESS_SPACE:
4305                 return KVM_ADDRESS_SPACE_NUM;
4306 #endif
4307         case KVM_CAP_NR_MEMSLOTS:
4308                 return KVM_USER_MEM_SLOTS;
4309         case KVM_CAP_DIRTY_LOG_RING:
4310 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
4311                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4312 #else
4313                 return 0;
4314 #endif
4315         case KVM_CAP_BINARY_STATS_FD:
4316                 return 1;
4317         default:
4318                 break;
4319         }
4320         return kvm_vm_ioctl_check_extension(kvm, arg);
4321 }
4322
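/*
 * Enable dirty rings for this VM.  @size is the per-vCPU ring size in bytes;
 * it must be a power of two, at least a page, large enough to hold the
 * reserved entries, no larger than KVM_DIRTY_RING_MAX_ENTRIES entries, and
 * can only be set once, before any vCPU has been created.
 */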
4323 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4324 {
4325         int r;
4326
4327         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4328                 return -EINVAL;
4329
4330         /* the size should be a power of 2 */
4331         if (!size || (size & (size - 1)))
4332                 return -EINVAL;
4333
4334         /* The size must be big enough to hold the reserved entries and be at least a page */
4335         if (size < kvm_dirty_ring_get_rsvd_entries() *
4336             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4337                 return -EINVAL;
4338
4339         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4340             sizeof(struct kvm_dirty_gfn))
4341                 return -E2BIG;
4342
4343         /* We only allow the ring size to be set once */
4344         if (kvm->dirty_ring_size)
4345                 return -EINVAL;
4346
4347         mutex_lock(&kvm->lock);
4348
4349         if (kvm->created_vcpus) {
4350                 /* We don't allow changing this value after vcpus are created */
4351                 r = -EINVAL;
4352         } else {
4353                 kvm->dirty_ring_size = size;
4354                 r = 0;
4355         }
4356
4357         mutex_unlock(&kvm->lock);
4358         return r;
4359 }
4360
4361 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4362 {
4363         unsigned long i;
4364         struct kvm_vcpu *vcpu;
4365         int cleared = 0;
4366
4367         if (!kvm->dirty_ring_size)
4368                 return -EINVAL;
4369
4370         mutex_lock(&kvm->slots_lock);
4371
4372         kvm_for_each_vcpu(i, vcpu, kvm)
4373                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4374
4375         mutex_unlock(&kvm->slots_lock);
4376
4377         if (cleared)
4378                 kvm_flush_remote_tlbs(kvm);
4379
4380         return cleared;
4381 }
4382
4383 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4384                                                   struct kvm_enable_cap *cap)
4385 {
4386         return -EINVAL;
4387 }
4388
4389 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4390                                            struct kvm_enable_cap *cap)
4391 {
4392         switch (cap->cap) {
4393 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4394         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4395                 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4396
4397                 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4398                         allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4399
4400                 if (cap->flags || (cap->args[0] & ~allowed_options))
4401                         return -EINVAL;
4402                 kvm->manual_dirty_log_protect = cap->args[0];
4403                 return 0;
4404         }
4405 #endif
4406         case KVM_CAP_HALT_POLL: {
4407                 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4408                         return -EINVAL;
4409
4410                 kvm->max_halt_poll_ns = cap->args[0];
4411                 return 0;
4412         }
4413         case KVM_CAP_DIRTY_LOG_RING:
4414                 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4415         default:
4416                 return kvm_vm_ioctl_enable_cap(kvm, cap);
4417         }
4418 }
4419
4420 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4421                               size_t size, loff_t *offset)
4422 {
4423         struct kvm *kvm = file->private_data;
4424
4425         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4426                                 &kvm_vm_stats_desc[0], &kvm->stat,
4427                                 sizeof(kvm->stat), user_buffer, size, offset);
4428 }
4429
4430 static const struct file_operations kvm_vm_stats_fops = {
4431         .read = kvm_vm_stats_read,
4432         .llseek = noop_llseek,
4433 };
4434
4435 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4436 {
4437         int fd;
4438         struct file *file;
4439
4440         fd = get_unused_fd_flags(O_CLOEXEC);
4441         if (fd < 0)
4442                 return fd;
4443
4444         file = anon_inode_getfile("kvm-vm-stats",
4445                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4446         if (IS_ERR(file)) {
4447                 put_unused_fd(fd);
4448                 return PTR_ERR(file);
4449         }
4450         file->f_mode |= FMODE_PREAD;
4451         fd_install(fd, file);
4452
4453         return fd;
4454 }
4455
4456 static long kvm_vm_ioctl(struct file *filp,
4457                            unsigned int ioctl, unsigned long arg)
4458 {
4459         struct kvm *kvm = filp->private_data;
4460         void __user *argp = (void __user *)arg;
4461         int r;
4462
4463         if (kvm->mm != current->mm || kvm->vm_dead)
4464                 return -EIO;
4465         switch (ioctl) {
4466         case KVM_CREATE_VCPU:
4467                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4468                 break;
4469         case KVM_ENABLE_CAP: {
4470                 struct kvm_enable_cap cap;
4471
4472                 r = -EFAULT;
4473                 if (copy_from_user(&cap, argp, sizeof(cap)))
4474                         goto out;
4475                 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4476                 break;
4477         }
4478         case KVM_SET_USER_MEMORY_REGION: {
4479                 struct kvm_userspace_memory_region kvm_userspace_mem;
4480
4481                 r = -EFAULT;
4482                 if (copy_from_user(&kvm_userspace_mem, argp,
4483                                                 sizeof(kvm_userspace_mem)))
4484                         goto out;
4485
4486                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4487                 break;
4488         }
4489         case KVM_GET_DIRTY_LOG: {
4490                 struct kvm_dirty_log log;
4491
4492                 r = -EFAULT;
4493                 if (copy_from_user(&log, argp, sizeof(log)))
4494                         goto out;
4495                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4496                 break;
4497         }
4498 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4499         case KVM_CLEAR_DIRTY_LOG: {
4500                 struct kvm_clear_dirty_log log;
4501
4502                 r = -EFAULT;
4503                 if (copy_from_user(&log, argp, sizeof(log)))
4504                         goto out;
4505                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4506                 break;
4507         }
4508 #endif
4509 #ifdef CONFIG_KVM_MMIO
4510         case KVM_REGISTER_COALESCED_MMIO: {
4511                 struct kvm_coalesced_mmio_zone zone;
4512
4513                 r = -EFAULT;
4514                 if (copy_from_user(&zone, argp, sizeof(zone)))
4515                         goto out;
4516                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4517                 break;
4518         }
4519         case KVM_UNREGISTER_COALESCED_MMIO: {
4520                 struct kvm_coalesced_mmio_zone zone;
4521
4522                 r = -EFAULT;
4523                 if (copy_from_user(&zone, argp, sizeof(zone)))
4524                         goto out;
4525                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4526                 break;
4527         }
4528 #endif
4529         case KVM_IRQFD: {
4530                 struct kvm_irqfd data;
4531
4532                 r = -EFAULT;
4533                 if (copy_from_user(&data, argp, sizeof(data)))
4534                         goto out;
4535                 r = kvm_irqfd(kvm, &data);
4536                 break;
4537         }
4538         case KVM_IOEVENTFD: {
4539                 struct kvm_ioeventfd data;
4540
4541                 r = -EFAULT;
4542                 if (copy_from_user(&data, argp, sizeof(data)))
4543                         goto out;
4544                 r = kvm_ioeventfd(kvm, &data);
4545                 break;
4546         }
4547 #ifdef CONFIG_HAVE_KVM_MSI
4548         case KVM_SIGNAL_MSI: {
4549                 struct kvm_msi msi;
4550
4551                 r = -EFAULT;
4552                 if (copy_from_user(&msi, argp, sizeof(msi)))
4553                         goto out;
4554                 r = kvm_send_userspace_msi(kvm, &msi);
4555                 break;
4556         }
4557 #endif
4558 #ifdef __KVM_HAVE_IRQ_LINE
4559         case KVM_IRQ_LINE_STATUS:
4560         case KVM_IRQ_LINE: {
4561                 struct kvm_irq_level irq_event;
4562
4563                 r = -EFAULT;
4564                 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4565                         goto out;
4566
4567                 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4568                                         ioctl == KVM_IRQ_LINE_STATUS);
4569                 if (r)
4570                         goto out;
4571
4572                 r = -EFAULT;
4573                 if (ioctl == KVM_IRQ_LINE_STATUS) {
4574                         if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4575                                 goto out;
4576                 }
4577
4578                 r = 0;
4579                 break;
4580         }
4581 #endif
4582 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4583         case KVM_SET_GSI_ROUTING: {
4584                 struct kvm_irq_routing routing;
4585                 struct kvm_irq_routing __user *urouting;
4586                 struct kvm_irq_routing_entry *entries = NULL;
4587
4588                 r = -EFAULT;
4589                 if (copy_from_user(&routing, argp, sizeof(routing)))
4590                         goto out;
4591                 r = -EINVAL;
4592                 if (!kvm_arch_can_set_irq_routing(kvm))
4593                         goto out;
4594                 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4595                         goto out;
4596                 if (routing.flags)
4597                         goto out;
4598                 if (routing.nr) {
4599                         urouting = argp;
4600                         entries = vmemdup_user(urouting->entries,
4601                                                array_size(sizeof(*entries),
4602                                                           routing.nr));
4603                         if (IS_ERR(entries)) {
4604                                 r = PTR_ERR(entries);
4605                                 goto out;
4606                         }
4607                 }
4608                 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4609                                         routing.flags);
4610                 kvfree(entries);
4611                 break;
4612         }
4613 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4614         case KVM_CREATE_DEVICE: {
4615                 struct kvm_create_device cd;
4616
4617                 r = -EFAULT;
4618                 if (copy_from_user(&cd, argp, sizeof(cd)))
4619                         goto out;
4620
4621                 r = kvm_ioctl_create_device(kvm, &cd);
4622                 if (r)
4623                         goto out;
4624
4625                 r = -EFAULT;
4626                 if (copy_to_user(argp, &cd, sizeof(cd)))
4627                         goto out;
4628
4629                 r = 0;
4630                 break;
4631         }
4632         case KVM_CHECK_EXTENSION:
4633                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4634                 break;
4635         case KVM_RESET_DIRTY_RINGS:
4636                 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4637                 break;
4638         case KVM_GET_STATS_FD:
4639                 r = kvm_vm_ioctl_get_stats_fd(kvm);
4640                 break;
4641         default:
4642                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4643         }
4644 out:
4645         return r;
4646 }
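
/*
 * Illustrative userspace sketch (not part of this file) of the two most
 * common VM ioctls dispatched above: KVM_SET_USER_MEMORY_REGION and
 * KVM_CREATE_VCPU.  'vm_fd' comes from KVM_CREATE_VM, the 2 MiB guest
 * memory layout is hypothetical, error handling is elided, and the
 * <linux/kvm.h>, <sys/ioctl.h> and <sys/mman.h> headers are assumed:
 *
 *	void *mem = mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = 0,
 *		.guest_phys_addr = 0x0,
 *		.memory_size     = 0x200000,
 *		.userspace_addr  = (__u64)(unsigned long)mem,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 */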
4647
4648 #ifdef CONFIG_KVM_COMPAT
4649 struct compat_kvm_dirty_log {
4650         __u32 slot;
4651         __u32 padding1;
4652         union {
4653                 compat_uptr_t dirty_bitmap; /* one bit per page */
4654                 __u64 padding2;
4655         };
4656 };
4657
4658 struct compat_kvm_clear_dirty_log {
4659         __u32 slot;
4660         __u32 num_pages;
4661         __u64 first_page;
4662         union {
4663                 compat_uptr_t dirty_bitmap; /* one bit per page */
4664                 __u64 padding2;
4665         };
4666 };
4667
4668 static long kvm_vm_compat_ioctl(struct file *filp,
4669                            unsigned int ioctl, unsigned long arg)
4670 {
4671         struct kvm *kvm = filp->private_data;
4672         int r;
4673
4674         if (kvm->mm != current->mm || kvm->vm_dead)
4675                 return -EIO;
4676         switch (ioctl) {
4677 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4678         case KVM_CLEAR_DIRTY_LOG: {
4679                 struct compat_kvm_clear_dirty_log compat_log;
4680                 struct kvm_clear_dirty_log log;
4681
4682                 if (copy_from_user(&compat_log, (void __user *)arg,
4683                                    sizeof(compat_log)))
4684                         return -EFAULT;
4685                 log.slot         = compat_log.slot;
4686                 log.num_pages    = compat_log.num_pages;
4687                 log.first_page   = compat_log.first_page;
4688                 log.padding2     = compat_log.padding2;
4689                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4690
4691                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4692                 break;
4693         }
4694 #endif
4695         case KVM_GET_DIRTY_LOG: {
4696                 struct compat_kvm_dirty_log compat_log;
4697                 struct kvm_dirty_log log;
4698
4699                 if (copy_from_user(&compat_log, (void __user *)arg,
4700                                    sizeof(compat_log)))
4701                         return -EFAULT;
4702                 log.slot         = compat_log.slot;
4703                 log.padding1     = compat_log.padding1;
4704                 log.padding2     = compat_log.padding2;
4705                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4706
4707                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4708                 break;
4709         }
4710         default:
4711                 r = kvm_vm_ioctl(filp, ioctl, arg);
4712         }
4713         return r;
4714 }
4715 #endif
4716
4717 static struct file_operations kvm_vm_fops = {
4718         .release        = kvm_vm_release,
4719         .unlocked_ioctl = kvm_vm_ioctl,
4720         .llseek         = noop_llseek,
4721         KVM_COMPAT(kvm_vm_compat_ioctl),
4722 };
4723
4724 bool file_is_kvm(struct file *file)
4725 {
4726         return file && file->f_op == &kvm_vm_fops;
4727 }
4728 EXPORT_SYMBOL_GPL(file_is_kvm);
4729
4730 static int kvm_dev_ioctl_create_vm(unsigned long type)
4731 {
4732         int r;
4733         struct kvm *kvm;
4734         struct file *file;
4735
4736         kvm = kvm_create_vm(type);
4737         if (IS_ERR(kvm))
4738                 return PTR_ERR(kvm);
4739 #ifdef CONFIG_KVM_MMIO
4740         r = kvm_coalesced_mmio_init(kvm);
4741         if (r < 0)
4742                 goto put_kvm;
4743 #endif
4744         r = get_unused_fd_flags(O_CLOEXEC);
4745         if (r < 0)
4746                 goto put_kvm;
4747
4748         snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4749                         "kvm-%d", task_pid_nr(current));
4750
4751         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4752         if (IS_ERR(file)) {
4753                 put_unused_fd(r);
4754                 r = PTR_ERR(file);
4755                 goto put_kvm;
4756         }
4757
4758         /*
4759          * Don't call kvm_put_kvm anymore at this point; file->f_op is
4760          * already set, with ->release() being kvm_vm_release().  In error
4761          * cases it will be called by the final fput(file) and will take
4762          * care of doing kvm_put_kvm(kvm).
4763          */
4764         if (kvm_create_vm_debugfs(kvm, r) < 0) {
4765                 put_unused_fd(r);
4766                 fput(file);
4767                 return -ENOMEM;
4768         }
4769         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4770
4771         fd_install(r, file);
4772         return r;
4773
4774 put_kvm:
4775         kvm_put_kvm(kvm);
4776         return r;
4777 }
4778
4779 static long kvm_dev_ioctl(struct file *filp,
4780                           unsigned int ioctl, unsigned long arg)
4781 {
4782         long r = -EINVAL;
4783
4784         switch (ioctl) {
4785         case KVM_GET_API_VERSION:
4786                 if (arg)
4787                         goto out;
4788                 r = KVM_API_VERSION;
4789                 break;
4790         case KVM_CREATE_VM:
4791                 r = kvm_dev_ioctl_create_vm(arg);
4792                 break;
4793         case KVM_CHECK_EXTENSION:
4794                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4795                 break;
4796         case KVM_GET_VCPU_MMAP_SIZE:
4797                 if (arg)
4798                         goto out;
4799                 r = PAGE_SIZE;     /* struct kvm_run */
4800 #ifdef CONFIG_X86
4801                 r += PAGE_SIZE;    /* pio data page */
4802 #endif
4803 #ifdef CONFIG_KVM_MMIO
4804                 r += PAGE_SIZE;    /* coalesced mmio ring page */
4805 #endif
4806                 break;
4807         case KVM_TRACE_ENABLE:
4808         case KVM_TRACE_PAUSE:
4809         case KVM_TRACE_DISABLE:
4810                 r = -EOPNOTSUPP;
4811                 break;
4812         default:
4813                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4814         }
4815 out:
4816         return r;
4817 }
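
/*
 * Illustrative userspace sketch (not part of this file) of the /dev/kvm
 * handshake handled above: check the API version, create a VM and query
 * the vcpu mmap size later used to map struct kvm_run.  Error handling is
 * elided; the <fcntl.h>, <sys/ioctl.h> and <linux/kvm.h> headers are
 * assumed:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		return -1;
 *
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	long run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 */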
4818
4819 static struct file_operations kvm_chardev_ops = {
4820         .unlocked_ioctl = kvm_dev_ioctl,
4821         .llseek         = noop_llseek,
4822         KVM_COMPAT(kvm_dev_ioctl),
4823 };
4824
4825 static struct miscdevice kvm_dev = {
4826         KVM_MINOR,
4827         "kvm",
4828         &kvm_chardev_ops,
4829 };
4830
4831 static void hardware_enable_nolock(void *junk)
4832 {
4833         int cpu = raw_smp_processor_id();
4834         int r;
4835
4836         if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4837                 return;
4838
4839         cpumask_set_cpu(cpu, cpus_hardware_enabled);
4840
4841         r = kvm_arch_hardware_enable();
4842
4843         if (r) {
4844                 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4845                 atomic_inc(&hardware_enable_failed);
4846                 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4847         }
4848 }
4849
4850 static int kvm_starting_cpu(unsigned int cpu)
4851 {
4852         raw_spin_lock(&kvm_count_lock);
4853         if (kvm_usage_count)
4854                 hardware_enable_nolock(NULL);
4855         raw_spin_unlock(&kvm_count_lock);
4856         return 0;
4857 }
4858
4859 static void hardware_disable_nolock(void *junk)
4860 {
4861         int cpu = raw_smp_processor_id();
4862
4863         if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4864                 return;
4865         cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4866         kvm_arch_hardware_disable();
4867 }
4868
4869 static int kvm_dying_cpu(unsigned int cpu)
4870 {
4871         raw_spin_lock(&kvm_count_lock);
4872         if (kvm_usage_count)
4873                 hardware_disable_nolock(NULL);
4874         raw_spin_unlock(&kvm_count_lock);
4875         return 0;
4876 }
4877
4878 static void hardware_disable_all_nolock(void)
4879 {
4880         BUG_ON(!kvm_usage_count);
4881
4882         kvm_usage_count--;
4883         if (!kvm_usage_count)
4884                 on_each_cpu(hardware_disable_nolock, NULL, 1);
4885 }
4886
4887 static void hardware_disable_all(void)
4888 {
4889         raw_spin_lock(&kvm_count_lock);
4890         hardware_disable_all_nolock();
4891         raw_spin_unlock(&kvm_count_lock);
4892 }
4893
4894 static int hardware_enable_all(void)
4895 {
4896         int r = 0;
4897
4898         raw_spin_lock(&kvm_count_lock);
4899
4900         kvm_usage_count++;
4901         if (kvm_usage_count == 1) {
4902                 atomic_set(&hardware_enable_failed, 0);
4903                 on_each_cpu(hardware_enable_nolock, NULL, 1);
4904
4905                 if (atomic_read(&hardware_enable_failed)) {
4906                         hardware_disable_all_nolock();
4907                         r = -EBUSY;
4908                 }
4909         }
4910
4911         raw_spin_unlock(&kvm_count_lock);
4912
4913         return r;
4914 }
4915
4916 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4917                       void *v)
4918 {
4919         /*
4920          * Some (well, at least mine) BIOSes hang on reboot if
4921          * in vmx root mode.
4922          * in VMX root mode.
4923          * And Intel TXT requires VMX to be off on all CPUs at system shutdown.
4924          */
4925         pr_info("kvm: exiting hardware virtualization\n");
4926         kvm_rebooting = true;
4927         on_each_cpu(hardware_disable_nolock, NULL, 1);
4928         return NOTIFY_OK;
4929 }
4930
4931 static struct notifier_block kvm_reboot_notifier = {
4932         .notifier_call = kvm_reboot,
4933         .priority = 0,
4934 };
4935
4936 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4937 {
4938         int i;
4939
4940         for (i = 0; i < bus->dev_count; i++) {
4941                 struct kvm_io_device *pos = bus->range[i].dev;
4942
4943                 kvm_iodevice_destructor(pos);
4944         }
4945         kfree(bus);
4946 }
4947
4948 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4949                                  const struct kvm_io_range *r2)
4950 {
4951         gpa_t addr1 = r1->addr;
4952         gpa_t addr2 = r2->addr;
4953
4954         if (addr1 < addr2)
4955                 return -1;
4956
4957         /* If r2->len == 0, match the exact address.  If r2->len != 0,
4958          * accept any overlapping write.  Any order is acceptable for
4959          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
4960          * we process all of them.
4961          */
4962         if (r2->len) {
4963                 addr1 += r1->len;
4964                 addr2 += r2->len;
4965         }
4966
4967         if (addr1 > addr2)
4968                 return 1;
4969
4970         return 0;
4971 }
4972
4973 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4974 {
4975         return kvm_io_bus_cmp(p1, p2);
4976 }
4977
4978 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4979                              gpa_t addr, int len)
4980 {
4981         struct kvm_io_range *range, key;
4982         int off;
4983
4984         key = (struct kvm_io_range) {
4985                 .addr = addr,
4986                 .len = len,
4987         };
4988
4989         range = bsearch(&key, bus->range, bus->dev_count,
4990                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4991         if (range == NULL)
4992                 return -ENOENT;
4993
4994         off = range - bus->range;
4995
4996         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4997                 off--;
4998
4999         return off;
5000 }
5001
5002 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5003                               struct kvm_io_range *range, const void *val)
5004 {
5005         int idx;
5006
5007         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5008         if (idx < 0)
5009                 return -EOPNOTSUPP;
5010
5011         while (idx < bus->dev_count &&
5012                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5013                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5014                                         range->len, val))
5015                         return idx;
5016                 idx++;
5017         }
5018
5019         return -EOPNOTSUPP;
5020 }
5021
5022 /* kvm_io_bus_write - called under kvm->slots_lock */
5023 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5024                      int len, const void *val)
5025 {
5026         struct kvm_io_bus *bus;
5027         struct kvm_io_range range;
5028         int r;
5029
5030         range = (struct kvm_io_range) {
5031                 .addr = addr,
5032                 .len = len,
5033         };
5034
5035         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5036         if (!bus)
5037                 return -ENOMEM;
5038         r = __kvm_io_bus_write(vcpu, bus, &range, val);
5039         return r < 0 ? r : 0;
5040 }
5041 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5042
5043 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5044 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5045                             gpa_t addr, int len, const void *val, long cookie)
5046 {
5047         struct kvm_io_bus *bus;
5048         struct kvm_io_range range;
5049
5050         range = (struct kvm_io_range) {
5051                 .addr = addr,
5052                 .len = len,
5053         };
5054
5055         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5056         if (!bus)
5057                 return -ENOMEM;
5058
5059         /* First try the device referenced by cookie. */
5060         if ((cookie >= 0) && (cookie < bus->dev_count) &&
5061             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5062                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5063                                         val))
5064                         return cookie;
5065
5066         /*
5067          * cookie contained garbage; fall back to search and return the
5068          * correct cookie value.
5069          */
5070         return __kvm_io_bus_write(vcpu, bus, &range, val);
5071 }
5072
5073 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5074                              struct kvm_io_range *range, void *val)
5075 {
5076         int idx;
5077
5078         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5079         if (idx < 0)
5080                 return -EOPNOTSUPP;
5081
5082         while (idx < bus->dev_count &&
5083                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5084                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5085                                        range->len, val))
5086                         return idx;
5087                 idx++;
5088         }
5089
5090         return -EOPNOTSUPP;
5091 }
5092
5093 /* kvm_io_bus_read - called under kvm->slots_lock */
5094 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5095                     int len, void *val)
5096 {
5097         struct kvm_io_bus *bus;
5098         struct kvm_io_range range;
5099         int r;
5100
5101         range = (struct kvm_io_range) {
5102                 .addr = addr,
5103                 .len = len,
5104         };
5105
5106         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5107         if (!bus)
5108                 return -ENOMEM;
5109         r = __kvm_io_bus_read(vcpu, bus, &range, val);
5110         return r < 0 ? r : 0;
5111 }
5112
5113 /* Caller must hold slots_lock. */
5114 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5115                             int len, struct kvm_io_device *dev)
5116 {
5117         int i;
5118         struct kvm_io_bus *new_bus, *bus;
5119         struct kvm_io_range range;
5120
5121         bus = kvm_get_bus(kvm, bus_idx);
5122         if (!bus)
5123                 return -ENOMEM;
5124
5125         /* Exclude ioeventfds, which are already bounded by the maximum fd count. */
5126         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5127                 return -ENOSPC;
5128
5129         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5130                           GFP_KERNEL_ACCOUNT);
5131         if (!new_bus)
5132                 return -ENOMEM;
5133
5134         range = (struct kvm_io_range) {
5135                 .addr = addr,
5136                 .len = len,
5137                 .dev = dev,
5138         };
5139
5140         for (i = 0; i < bus->dev_count; i++)
5141                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5142                         break;
5143
5144         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5145         new_bus->dev_count++;
5146         new_bus->range[i] = range;
5147         memcpy(new_bus->range + i + 1, bus->range + i,
5148                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5149         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5150         synchronize_srcu_expedited(&kvm->srcu);
5151         kfree(bus);
5152
5153         return 0;
5154 }
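
/*
 * Illustrative in-kernel sketch of registering a device with the helper
 * above, in the style of the coalesced MMIO and ioeventfd code.  The
 * device, callback and variable names are hypothetical; the callback
 * signature follows struct kvm_io_device_ops from <kvm/iodev.h>:
 *
 *	static int example_mmio_write(struct kvm_vcpu *vcpu,
 *				      struct kvm_io_device *this,
 *				      gpa_t addr, int len, const void *val)
 *	{
 *		return 0;
 *	}
 *
 *	static const struct kvm_io_device_ops example_mmio_ops = {
 *		.write = example_mmio_write,
 *	};
 *
 *	kvm_iodevice_init(&edev->dev, &example_mmio_ops);
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, gpa, len, &edev->dev);
 *	mutex_unlock(&kvm->slots_lock);
 *
 * A zero return from the ->write() callback means the access was handled;
 * a negative value lets __kvm_io_bus_write() try the next matching device.
 */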
5155
5156 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5157                               struct kvm_io_device *dev)
5158 {
5159         int i, j;
5160         struct kvm_io_bus *new_bus, *bus;
5161
5162         lockdep_assert_held(&kvm->slots_lock);
5163
5164         bus = kvm_get_bus(kvm, bus_idx);
5165         if (!bus)
5166                 return 0;
5167
5168         for (i = 0; i < bus->dev_count; i++) {
5169                 if (bus->range[i].dev == dev) {
5170                         break;
5171                 }
5172         }
5173
5174         if (i == bus->dev_count)
5175                 return 0;
5176
5177         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5178                           GFP_KERNEL_ACCOUNT);
5179         if (new_bus) {
5180                 memcpy(new_bus, bus, struct_size(bus, range, i));
5181                 new_bus->dev_count--;
5182                 memcpy(new_bus->range + i, bus->range + i + 1,
5183                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5184         }
5185
5186         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5187         synchronize_srcu_expedited(&kvm->srcu);
5188
5189         /* Destroy the old bus _after_ installing the (null) bus. */
5190         if (!new_bus) {
5191                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5192                 for (j = 0; j < bus->dev_count; j++) {
5193                         if (j == i)
5194                                 continue;
5195                         kvm_iodevice_destructor(bus->range[j].dev);
5196                 }
5197         }
5198
5199         kfree(bus);
5200         return new_bus ? 0 : -ENOMEM;
5201 }
5202
5203 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5204                                          gpa_t addr)
5205 {
5206         struct kvm_io_bus *bus;
5207         int dev_idx, srcu_idx;
5208         struct kvm_io_device *iodev = NULL;
5209
5210         srcu_idx = srcu_read_lock(&kvm->srcu);
5211
5212         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5213         if (!bus)
5214                 goto out_unlock;
5215
5216         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5217         if (dev_idx < 0)
5218                 goto out_unlock;
5219
5220         iodev = bus->range[dev_idx].dev;
5221
5222 out_unlock:
5223         srcu_read_unlock(&kvm->srcu, srcu_idx);
5224
5225         return iodev;
5226 }
5227 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5228
5229 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5230                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5231                            const char *fmt)
5232 {
5233         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5234                                           inode->i_private;
5235
5236         /*
5237          * The debugfs files are a reference to the kvm struct which
5238          * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5239          * avoids the race between open and the removal of the debugfs directory.
5240          */
5241         if (!kvm_get_kvm_safe(stat_data->kvm))
5242                 return -ENOENT;
5243
5244         if (simple_attr_open(inode, file, get,
5245                     kvm_stats_debugfs_mode(stat_data->desc) & 0222
5246                     ? set : NULL,
5247                     fmt)) {
5248                 kvm_put_kvm(stat_data->kvm);
5249                 return -ENOMEM;
5250         }
5251
5252         return 0;
5253 }
5254
5255 static int kvm_debugfs_release(struct inode *inode, struct file *file)
5256 {
5257         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5258                                           inode->i_private;
5259
5260         simple_attr_release(inode, file);
5261         kvm_put_kvm(stat_data->kvm);
5262
5263         return 0;
5264 }
5265
5266 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5267 {
5268         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5269
5270         return 0;
5271 }
5272
5273 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5274 {
5275         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5276
5277         return 0;
5278 }
5279
5280 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5281 {
5282         unsigned long i;
5283         struct kvm_vcpu *vcpu;
5284
5285         *val = 0;
5286
5287         kvm_for_each_vcpu(i, vcpu, kvm)
5288                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5289
5290         return 0;
5291 }
5292
5293 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5294 {
5295         unsigned long i;
5296         struct kvm_vcpu *vcpu;
5297
5298         kvm_for_each_vcpu(i, vcpu, kvm)
5299                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5300
5301         return 0;
5302 }
5303
5304 static int kvm_stat_data_get(void *data, u64 *val)
5305 {
5306         int r = -EFAULT;
5307         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5308
5309         switch (stat_data->kind) {
5310         case KVM_STAT_VM:
5311                 r = kvm_get_stat_per_vm(stat_data->kvm,
5312                                         stat_data->desc->desc.offset, val);
5313                 break;
5314         case KVM_STAT_VCPU:
5315                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5316                                           stat_data->desc->desc.offset, val);
5317                 break;
5318         }
5319
5320         return r;
5321 }
5322
5323 static int kvm_stat_data_clear(void *data, u64 val)
5324 {
5325         int r = -EFAULT;
5326         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5327
5328         if (val)
5329                 return -EINVAL;
5330
5331         switch (stat_data->kind) {
5332         case KVM_STAT_VM:
5333                 r = kvm_clear_stat_per_vm(stat_data->kvm,
5334                                           stat_data->desc->desc.offset);
5335                 break;
5336         case KVM_STAT_VCPU:
5337                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5338                                             stat_data->desc->desc.offset);
5339                 break;
5340         }
5341
5342         return r;
5343 }
5344
5345 static int kvm_stat_data_open(struct inode *inode, struct file *file)
5346 {
5347         __simple_attr_check_format("%llu\n", 0ull);
5348         return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5349                                 kvm_stat_data_clear, "%llu\n");
5350 }
5351
5352 static const struct file_operations stat_fops_per_vm = {
5353         .owner = THIS_MODULE,
5354         .open = kvm_stat_data_open,
5355         .release = kvm_debugfs_release,
5356         .read = simple_attr_read,
5357         .write = simple_attr_write,
5358         .llseek = no_llseek,
5359 };
5360
5361 static int vm_stat_get(void *_offset, u64 *val)
5362 {
5363         unsigned offset = (long)_offset;
5364         struct kvm *kvm;
5365         u64 tmp_val;
5366
5367         *val = 0;
5368         mutex_lock(&kvm_lock);
5369         list_for_each_entry(kvm, &vm_list, vm_list) {
5370                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5371                 *val += tmp_val;
5372         }
5373         mutex_unlock(&kvm_lock);
5374         return 0;
5375 }
5376
5377 static int vm_stat_clear(void *_offset, u64 val)
5378 {
5379         unsigned offset = (long)_offset;
5380         struct kvm *kvm;
5381
5382         if (val)
5383                 return -EINVAL;
5384
5385         mutex_lock(&kvm_lock);
5386         list_for_each_entry(kvm, &vm_list, vm_list) {
5387                 kvm_clear_stat_per_vm(kvm, offset);
5388         }
5389         mutex_unlock(&kvm_lock);
5390
5391         return 0;
5392 }
5393
5394 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5395 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5396
5397 static int vcpu_stat_get(void *_offset, u64 *val)
5398 {
5399         unsigned offset = (long)_offset;
5400         struct kvm *kvm;
5401         u64 tmp_val;
5402
5403         *val = 0;
5404         mutex_lock(&kvm_lock);
5405         list_for_each_entry(kvm, &vm_list, vm_list) {
5406                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5407                 *val += tmp_val;
5408         }
5409         mutex_unlock(&kvm_lock);
5410         return 0;
5411 }
5412
5413 static int vcpu_stat_clear(void *_offset, u64 val)
5414 {
5415         unsigned offset = (long)_offset;
5416         struct kvm *kvm;
5417
5418         if (val)
5419                 return -EINVAL;
5420
5421         mutex_lock(&kvm_lock);
5422         list_for_each_entry(kvm, &vm_list, vm_list) {
5423                 kvm_clear_stat_per_vcpu(kvm, offset);
5424         }
5425         mutex_unlock(&kvm_lock);
5426
5427         return 0;
5428 }
5429
5430 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5431                         "%llu\n");
5432 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5433
5434 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5435 {
5436         struct kobj_uevent_env *env;
5437         unsigned long long created, active;
5438
5439         if (!kvm_dev.this_device || !kvm)
5440                 return;
5441
5442         mutex_lock(&kvm_lock);
5443         if (type == KVM_EVENT_CREATE_VM) {
5444                 kvm_createvm_count++;
5445                 kvm_active_vms++;
5446         } else if (type == KVM_EVENT_DESTROY_VM) {
5447                 kvm_active_vms--;
5448         }
5449         created = kvm_createvm_count;
5450         active = kvm_active_vms;
5451         mutex_unlock(&kvm_lock);
5452
5453         env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5454         if (!env)
5455                 return;
5456
5457         add_uevent_var(env, "CREATED=%llu", created);
5458         add_uevent_var(env, "COUNT=%llu", active);
5459
5460         if (type == KVM_EVENT_CREATE_VM) {
5461                 add_uevent_var(env, "EVENT=create");
5462                 kvm->userspace_pid = task_pid_nr(current);
5463         } else if (type == KVM_EVENT_DESTROY_VM) {
5464                 add_uevent_var(env, "EVENT=destroy");
5465         }
5466         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5467
5468         if (kvm->debugfs_dentry) {
5469                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5470
5471                 if (p) {
5472                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5473                         if (!IS_ERR(tmp))
5474                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
5475                         kfree(p);
5476                 }
5477         }
5478         /* no need for checks, since we are adding at most 5 keys */
5479         env->envp[env->envp_idx++] = NULL;
5480         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5481         kfree(env);
5482 }
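
/*
 * Illustrative example of the environment added to the resulting CHANGE
 * uevent by the function above (values hypothetical), on top of the
 * standard uevent keys:
 *
 *	EVENT=create
 *	CREATED=1
 *	COUNT=1
 *	PID=1234
 *	STATS_PATH=/1234-10
 */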
5483
5484 static void kvm_init_debug(void)
5485 {
5486         const struct file_operations *fops;
5487         const struct _kvm_stats_desc *pdesc;
5488         int i;
5489
5490         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5491
5492         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5493                 pdesc = &kvm_vm_stats_desc[i];
5494                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5495                         fops = &vm_stat_fops;
5496                 else
5497                         fops = &vm_stat_readonly_fops;
5498                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5499                                 kvm_debugfs_dir,
5500                                 (void *)(long)pdesc->desc.offset, fops);
5501         }
5502
5503         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5504                 pdesc = &kvm_vcpu_stats_desc[i];
5505                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5506                         fops = &vcpu_stat_fops;
5507                 else
5508                         fops = &vcpu_stat_readonly_fops;
5509                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5510                                 kvm_debugfs_dir,
5511                                 (void *)(long)pdesc->desc.offset, fops);
5512         }
5513 }
5514
5515 static int kvm_suspend(void)
5516 {
5517         if (kvm_usage_count)
5518                 hardware_disable_nolock(NULL);
5519         return 0;
5520 }
5521
5522 static void kvm_resume(void)
5523 {
5524         if (kvm_usage_count) {
5525                 lockdep_assert_not_held(&kvm_count_lock);
5526                 hardware_enable_nolock(NULL);
5527         }
5528 }
5529
5530 static struct syscore_ops kvm_syscore_ops = {
5531         .suspend = kvm_suspend,
5532         .resume = kvm_resume,
5533 };
5534
5535 static inline
5536 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5537 {
5538         return container_of(pn, struct kvm_vcpu, preempt_notifier);
5539 }
5540
5541 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5542 {
5543         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5544
5545         WRITE_ONCE(vcpu->preempted, false);
5546         WRITE_ONCE(vcpu->ready, false);
5547
5548         __this_cpu_write(kvm_running_vcpu, vcpu);
5549         kvm_arch_sched_in(vcpu, cpu);
5550         kvm_arch_vcpu_load(vcpu, cpu);
5551 }
5552
5553 static void kvm_sched_out(struct preempt_notifier *pn,
5554                           struct task_struct *next)
5555 {
5556         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5557
5558         if (current->on_rq) {
5559                 WRITE_ONCE(vcpu->preempted, true);
5560                 WRITE_ONCE(vcpu->ready, true);
5561         }
5562         kvm_arch_vcpu_put(vcpu);
5563         __this_cpu_write(kvm_running_vcpu, NULL);
5564 }
5565
5566 /**
5567  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5568  *
5569  * We can disable preemption locally around accessing the per-CPU variable,
5570  * and use the resolved vcpu pointer after enabling preemption again,
5571  * because even if the current thread is migrated to another CPU, reading
5572  * the per-CPU value later will give us the same value, since the per-CPU
5573  * variable is updated in the preempt notifier handlers.
5574  */
5575 struct kvm_vcpu *kvm_get_running_vcpu(void)
5576 {
5577         struct kvm_vcpu *vcpu;
5578
5579         preempt_disable();
5580         vcpu = __this_cpu_read(kvm_running_vcpu);
5581         preempt_enable();
5582
5583         return vcpu;
5584 }
5585 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5586
5587 /**
5588  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5589  */
5590 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5591 {
5592         return &kvm_running_vcpu;
5593 }
5594
5595 #ifdef CONFIG_GUEST_PERF_EVENTS
5596 static unsigned int kvm_guest_state(void)
5597 {
5598         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5599         unsigned int state;
5600
5601         if (!kvm_arch_pmi_in_guest(vcpu))
5602                 return 0;
5603
5604         state = PERF_GUEST_ACTIVE;
5605         if (!kvm_arch_vcpu_in_kernel(vcpu))
5606                 state |= PERF_GUEST_USER;
5607
5608         return state;
5609 }
5610
5611 static unsigned long kvm_guest_get_ip(void)
5612 {
5613         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5614
5615         /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
5616         if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
5617                 return 0;
5618
5619         return kvm_arch_vcpu_get_ip(vcpu);
5620 }
5621
5622 static struct perf_guest_info_callbacks kvm_guest_cbs = {
5623         .state                  = kvm_guest_state,
5624         .get_ip                 = kvm_guest_get_ip,
5625         .handle_intel_pt_intr   = NULL,
5626 };
5627
5628 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
5629 {
5630         kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
5631         perf_register_guest_info_callbacks(&kvm_guest_cbs);
5632 }
5633 void kvm_unregister_perf_callbacks(void)
5634 {
5635         perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5636 }
5637 #endif
5638
5639 struct kvm_cpu_compat_check {
5640         void *opaque;
5641         int *ret;
5642 };
5643
5644 static void check_processor_compat(void *data)
5645 {
5646         struct kvm_cpu_compat_check *c = data;
5647
5648         *c->ret = kvm_arch_check_processor_compat(c->opaque);
5649 }
5650
5651 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5652                   struct module *module)
5653 {
5654         struct kvm_cpu_compat_check c;
5655         int r;
5656         int cpu;
5657
5658         r = kvm_arch_init(opaque);
5659         if (r)
5660                 goto out_fail;
5661
5662         /*
5663          * kvm_arch_init makes sure there's at most one caller
5664          * for architectures that support multiple implementations,
5665          * like Intel and AMD on x86.
5666          * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5667          * conflicts in case kvm is already set up for another implementation.
5668          */
5669         r = kvm_irqfd_init();
5670         if (r)
5671                 goto out_irqfd;
5672
5673         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5674                 r = -ENOMEM;
5675                 goto out_free_0;
5676         }
5677
5678         r = kvm_arch_hardware_setup(opaque);
5679         if (r < 0)
5680                 goto out_free_1;
5681
5682         c.ret = &r;
5683         c.opaque = opaque;
5684         for_each_online_cpu(cpu) {
5685                 smp_call_function_single(cpu, check_processor_compat, &c, 1);
5686                 if (r < 0)
5687                         goto out_free_2;
5688         }
5689
5690         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5691                                       kvm_starting_cpu, kvm_dying_cpu);
5692         if (r)
5693                 goto out_free_2;
5694         register_reboot_notifier(&kvm_reboot_notifier);
5695
5696         /* A kmem cache lets us meet the alignment requirements of fx_save. */
5697         if (!vcpu_align)
5698                 vcpu_align = __alignof__(struct kvm_vcpu);
5699         kvm_vcpu_cache =
5700                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5701                                            SLAB_ACCOUNT,
5702                                            offsetof(struct kvm_vcpu, arch),
5703                                            offsetofend(struct kvm_vcpu, stats_id)
5704                                            - offsetof(struct kvm_vcpu, arch),
5705                                            NULL);
5706         if (!kvm_vcpu_cache) {
5707                 r = -ENOMEM;
5708                 goto out_free_3;
5709         }
5710
5711         for_each_possible_cpu(cpu) {
5712                 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5713                                             GFP_KERNEL, cpu_to_node(cpu))) {
5714                         r = -ENOMEM;
5715                         goto out_free_4;
5716                 }
5717         }
5718
5719         r = kvm_async_pf_init();
5720         if (r)
5721                 goto out_free_5;
5722
5723         kvm_chardev_ops.owner = module;
5724         kvm_vm_fops.owner = module;
5725         kvm_vcpu_fops.owner = module;
5726
5727         r = misc_register(&kvm_dev);
5728         if (r) {
5729                 pr_err("kvm: misc device register failed\n");
5730                 goto out_unreg;
5731         }
5732
5733         register_syscore_ops(&kvm_syscore_ops);
5734
5735         kvm_preempt_ops.sched_in = kvm_sched_in;
5736         kvm_preempt_ops.sched_out = kvm_sched_out;
5737
5738         kvm_init_debug();
5739
5740         r = kvm_vfio_ops_init();
5741         WARN_ON(r);
5742
5743         return 0;
5744
5745 out_unreg:
5746         kvm_async_pf_deinit();
5747 out_free_5:
5748         for_each_possible_cpu(cpu)
5749                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5750 out_free_4:
5751         kmem_cache_destroy(kvm_vcpu_cache);
5752 out_free_3:
5753         unregister_reboot_notifier(&kvm_reboot_notifier);
5754         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5755 out_free_2:
5756         kvm_arch_hardware_unsetup();
5757 out_free_1:
5758         free_cpumask_var(cpus_hardware_enabled);
5759 out_free_0:
5760         kvm_irqfd_exit();
5761 out_irqfd:
5762         kvm_arch_exit();
5763 out_fail:
5764         return r;
5765 }
5766 EXPORT_SYMBOL_GPL(kvm_init);
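
/*
 * Illustrative sketch (all names hypothetical) of how an architecture
 * module is expected to invoke kvm_init() from its module_init(), passing
 * its opaque setup data, the vcpu allocation size/alignment and
 * THIS_MODULE:
 *
 *	static int __init example_arch_init(void)
 *	{
 *		return kvm_init(&example_init_ops, sizeof(struct example_vcpu),
 *				__alignof__(struct example_vcpu), THIS_MODULE);
 *	}
 *	module_init(example_arch_init);
 */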
5767
5768 void kvm_exit(void)
5769 {
5770         int cpu;
5771
5772         debugfs_remove_recursive(kvm_debugfs_dir);
5773         misc_deregister(&kvm_dev);
5774         for_each_possible_cpu(cpu)
5775                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5776         kmem_cache_destroy(kvm_vcpu_cache);
5777         kvm_async_pf_deinit();
5778         unregister_syscore_ops(&kvm_syscore_ops);
5779         unregister_reboot_notifier(&kvm_reboot_notifier);
5780         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5781         on_each_cpu(hardware_disable_nolock, NULL, 1);
5782         kvm_arch_hardware_unsetup();
5783         kvm_arch_exit();
5784         kvm_irqfd_exit();
5785         free_cpumask_var(cpus_hardware_enabled);
5786         kvm_vfio_ops_exit();
5787 }
5788 EXPORT_SYMBOL_GPL(kvm_exit);
5789
5790 struct kvm_vm_worker_thread_context {
5791         struct kvm *kvm;
5792         struct task_struct *parent;
5793         struct completion init_done;
5794         kvm_vm_thread_fn_t thread_fn;
5795         uintptr_t data;
5796         int err;
5797 };
5798
5799 static int kvm_vm_worker_thread(void *context)
5800 {
5801         /*
5802          * The init_context is allocated on the stack of the parent thread, so
5803          * we have to locally copy anything that is needed beyond initialization.
5804          */
5805         struct kvm_vm_worker_thread_context *init_context = context;
5806         struct task_struct *parent;
5807         struct kvm *kvm = init_context->kvm;
5808         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5809         uintptr_t data = init_context->data;
5810         int err;
5811
5812         err = kthread_park(current);
5813         /* kthread_park(current) is never supposed to return an error */
5814         WARN_ON(err != 0);
5815         if (err)
5816                 goto init_complete;
5817
5818         err = cgroup_attach_task_all(init_context->parent, current);
5819         if (err) {
5820                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5821                         __func__, err);
5822                 goto init_complete;
5823         }
5824
5825         set_user_nice(current, task_nice(init_context->parent));
5826
5827 init_complete:
5828         init_context->err = err;
5829         complete(&init_context->init_done);
5830         init_context = NULL;
5831
5832         if (err)
5833                 goto out;
5834
5835         /* Wait to be woken up by the spawner before proceeding. */
5836         kthread_parkme();
5837
5838         if (!kthread_should_stop())
5839                 err = thread_fn(kvm, data);
5840
5841 out:
5842         /*
5843          * Move the kthread back to its original cgroup to prevent it from
5844          * lingering in the cgroup of the VM process after the latter finishes
5845          * its execution.
5846          *
5847          * kthread_stop() waits on the 'exited' completion condition which is
5848          * set in exit_mm(), via mm_release(), in do_exit(). However, the
5849          * kthread is removed from the cgroup in the cgroup_exit() which is
5850          * called after the exit_mm(). This causes the kthread_stop() to return
5851          * before the kthread actually quits the cgroup.
5852          */
5853         rcu_read_lock();
5854         parent = rcu_dereference(current->real_parent);
5855         get_task_struct(parent);
5856         rcu_read_unlock();
5857         cgroup_attach_task_all(parent, current);
5858         put_task_struct(parent);
5859
5860         return err;
5861 }
5862
5863 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5864                                 uintptr_t data, const char *name,
5865                                 struct task_struct **thread_ptr)
5866 {
5867         struct kvm_vm_worker_thread_context init_context = {};
5868         struct task_struct *thread;
5869
5870         *thread_ptr = NULL;
5871         init_context.kvm = kvm;
5872         init_context.parent = current;
5873         init_context.thread_fn = thread_fn;
5874         init_context.data = data;
5875         init_completion(&init_context.init_done);
5876
5877         thread = kthread_run(kvm_vm_worker_thread, &init_context,
5878                              "%s-%d", name, task_pid_nr(current));
5879         if (IS_ERR(thread))
5880                 return PTR_ERR(thread);
5881
5882         /* kthread_run is never supposed to return NULL */
5883         WARN_ON(thread == NULL);
5884
5885         wait_for_completion(&init_context.init_done);
5886
5887         if (!init_context.err)
5888                 *thread_ptr = thread;
5889
5890         return init_context.err;
5891 }
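
/*
 * Illustrative sketch (hypothetical thread function and callers) of the
 * intended life cycle of a worker created with the helper above: create it
 * while setting up the VM, kthread_unpark() it when it should start
 * running thread_fn, and kthread_stop() it during VM teardown.  The example
 * worker just sleeps between iterations of its per-VM work:
 *
 *	static int example_worker_fn(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop())
 *			schedule_timeout_interruptible(HZ);
 *		return 0;
 *	}
 *
 *	err = kvm_vm_create_worker_thread(kvm, example_worker_fn, 0,
 *					  "kvm-example-worker", &thread);
 *	if (!err)
 *		kthread_unpark(thread);
 *
 *	kthread_stop(thread);
 */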