1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
4  * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
5  *
6  * Authors:
7  *    Paul Mackerras <paulus@au1.ibm.com>
8  *    Alexander Graf <agraf@suse.de>
9  *    Kevin Wolf <mail@kevin-wolf.de>
10  *
11  * Description: KVM functions specific to running on Book 3S
12  * processors in hypervisor mode (specifically POWER7 and later).
13  *
14  * This file is derived from arch/powerpc/kvm/book3s.c,
15  * by Alexander Graf <agraf@suse.de>.
16  */
17
18 #include <linux/kvm_host.h>
19 #include <linux/kernel.h>
20 #include <linux/err.h>
21 #include <linux/slab.h>
22 #include <linux/preempt.h>
23 #include <linux/sched/signal.h>
24 #include <linux/sched/stat.h>
25 #include <linux/delay.h>
26 #include <linux/export.h>
27 #include <linux/fs.h>
28 #include <linux/anon_inodes.h>
29 #include <linux/cpu.h>
30 #include <linux/cpumask.h>
31 #include <linux/spinlock.h>
32 #include <linux/page-flags.h>
33 #include <linux/srcu.h>
34 #include <linux/miscdevice.h>
35 #include <linux/debugfs.h>
36 #include <linux/gfp.h>
37 #include <linux/vmalloc.h>
38 #include <linux/highmem.h>
39 #include <linux/hugetlb.h>
40 #include <linux/kvm_irqfd.h>
41 #include <linux/irqbypass.h>
42 #include <linux/module.h>
43 #include <linux/compiler.h>
44 #include <linux/of.h>
45
46 #include <asm/ftrace.h>
47 #include <asm/reg.h>
48 #include <asm/ppc-opcode.h>
49 #include <asm/asm-prototypes.h>
50 #include <asm/archrandom.h>
51 #include <asm/debug.h>
52 #include <asm/disassemble.h>
53 #include <asm/cputable.h>
54 #include <asm/cacheflush.h>
55 #include <linux/uaccess.h>
56 #include <asm/interrupt.h>
57 #include <asm/io.h>
58 #include <asm/kvm_ppc.h>
59 #include <asm/kvm_book3s.h>
60 #include <asm/mmu_context.h>
61 #include <asm/lppaca.h>
62 #include <asm/processor.h>
63 #include <asm/cputhreads.h>
64 #include <asm/page.h>
65 #include <asm/hvcall.h>
66 #include <asm/switch_to.h>
67 #include <asm/smp.h>
68 #include <asm/dbell.h>
69 #include <asm/hmi.h>
70 #include <asm/pnv-pci.h>
71 #include <asm/mmu.h>
72 #include <asm/opal.h>
73 #include <asm/xics.h>
74 #include <asm/xive.h>
75 #include <asm/hw_breakpoint.h>
76 #include <asm/kvm_book3s_uvmem.h>
77 #include <asm/ultravisor.h>
78 #include <asm/dtl.h>
79
80 #include "book3s.h"
81
82 #define CREATE_TRACE_POINTS
83 #include "trace_hv.h"
84
85 /* #define EXIT_DEBUG */
86 /* #define EXIT_DEBUG_SIMPLE */
87 /* #define EXIT_DEBUG_INT */
88
89 /* Used to indicate that a guest page fault needs to be handled */
90 #define RESUME_PAGE_FAULT       (RESUME_GUEST | RESUME_FLAG_ARCH1)
91 /* Used to indicate that a guest passthrough interrupt needs to be handled */
92 #define RESUME_PASSTHROUGH      (RESUME_GUEST | RESUME_FLAG_ARCH2)
93
94 /* Used as a "null" value for timebase values */
95 #define TB_NIL  (~(u64)0)
96
97 static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
98
99 static int dynamic_mt_modes = 6;
100 module_param(dynamic_mt_modes, int, 0644);
101 MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
102 static int target_smt_mode;
103 module_param(target_smt_mode, int, 0644);
104 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
105
106 static bool one_vm_per_core;
107 module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
108 MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
109
110 #ifdef CONFIG_KVM_XICS
111 static const struct kernel_param_ops module_param_ops = {
112         .set = param_set_int,
113         .get = param_get_int,
114 };
115
116 module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 0644);
117 MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
118
119 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
120 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
121 #endif
122
123 /* If set, guests are allowed to create and control nested guests */
124 static bool nested = true;
125 module_param(nested, bool, S_IRUGO | S_IWUSR);
126 MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
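/*
 * When kvm-hv is built as a module, the parameters above show up under
 * /sys/module/kvm_hv/parameters/ and can also be set at load time, e.g.
 * "modprobe kvm_hv dynamic_mt_modes=4 one_vm_per_core=1" (illustrative
 * invocation; whether a given knob has any effect depends on the CPU
 * generation, as noted in the descriptions above).
 */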
127
128 static inline bool nesting_enabled(struct kvm *kvm)
129 {
130         return kvm->arch.nested_enable && kvm_is_radix(kvm);
131 }
132
133 /* If set, the threads on each CPU core have to be in the same MMU mode */
134 static bool no_mixing_hpt_and_radix __read_mostly;
135
136 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
137
138 /*
139  * RWMR values for POWER8.  These control the rate at which PURR
140  * and SPURR count and should be set according to the number of
141  * online threads in the vcore being run.
142  */
143 #define RWMR_RPA_P8_1THREAD     0x164520C62609AECAUL
144 #define RWMR_RPA_P8_2THREAD     0x7FFF2908450D8DA9UL
145 #define RWMR_RPA_P8_3THREAD     0x164520C62609AECAUL
146 #define RWMR_RPA_P8_4THREAD     0x199A421245058DA9UL
147 #define RWMR_RPA_P8_5THREAD     0x164520C62609AECAUL
148 #define RWMR_RPA_P8_6THREAD     0x164520C62609AECAUL
149 #define RWMR_RPA_P8_7THREAD     0x164520C62609AECAUL
150 #define RWMR_RPA_P8_8THREAD     0x164520C62609AECAUL
151
152 static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
153         RWMR_RPA_P8_1THREAD,
154         RWMR_RPA_P8_1THREAD,
155         RWMR_RPA_P8_2THREAD,
156         RWMR_RPA_P8_3THREAD,
157         RWMR_RPA_P8_4THREAD,
158         RWMR_RPA_P8_5THREAD,
159         RWMR_RPA_P8_6THREAD,
160         RWMR_RPA_P8_7THREAD,
161         RWMR_RPA_P8_8THREAD,
162 };
163
164 static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
165                 int *ip)
166 {
167         int i = *ip;
168         struct kvm_vcpu *vcpu;
169
170         while (++i < MAX_SMT_THREADS) {
171                 vcpu = READ_ONCE(vc->runnable_threads[i]);
172                 if (vcpu) {
173                         *ip = i;
174                         return vcpu;
175                 }
176         }
177         return NULL;
178 }
179
180 /* Used to traverse the list of runnable threads for a given vcore */
181 #define for_each_runnable_thread(i, vcpu, vc) \
182         for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
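/*
 * A minimal usage sketch (hypothetical caller, mirroring how the walkers
 * further down in this file use the macro):
 *
 *	int i;
 *	struct kvm_vcpu *v;
 *
 *	for_each_runnable_thread(i, v, vc)
 *		kvmppc_fast_vcpu_kick_hv(v);
 *
 * Starting i at -1 makes next_runnable_thread() begin its scan at
 * runnable_threads[0].
 */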
183
184 static bool kvmppc_ipi_thread(int cpu)
185 {
186         unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
187
188         /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
189         if (kvmhv_on_pseries())
190                 return false;
191
192         /* On POWER9 we can use msgsnd to IPI any cpu */
193         if (cpu_has_feature(CPU_FTR_ARCH_300)) {
194                 msg |= get_hard_smp_processor_id(cpu);
195                 smp_mb();
196                 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
197                 return true;
198         }
199
200         /* On POWER8 for IPIs to threads in the same core, use msgsnd */
201         if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
202                 preempt_disable();
203                 if (cpu_first_thread_sibling(cpu) ==
204                     cpu_first_thread_sibling(smp_processor_id())) {
205                         msg |= cpu_thread_in_core(cpu);
206                         smp_mb();
207                         __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
208                         preempt_enable();
209                         return true;
210                 }
211                 preempt_enable();
212         }
213
214 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
215         if (cpu >= 0 && cpu < nr_cpu_ids) {
216                 if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
217                         xics_wake_cpu(cpu);
218                         return true;
219                 }
220                 opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
221                 return true;
222         }
223 #endif
224
225         return false;
226 }
227
228 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
229 {
230         int cpu;
231         struct rcuwait *waitp;
232
233         waitp = kvm_arch_vcpu_get_wait(vcpu);
234         if (rcuwait_wake_up(waitp))
235                 ++vcpu->stat.halt_wakeup;
236
237         cpu = READ_ONCE(vcpu->arch.thread_cpu);
238         if (cpu >= 0 && kvmppc_ipi_thread(cpu))
239                 return;
240
241         /* CPU points to the first thread of the core */
242         cpu = vcpu->cpu;
243         if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
244                 smp_send_reschedule(cpu);
245 }
246
247 /*
248  * We use the vcpu_load/put functions to measure stolen time.
249  * Stolen time is counted as time when either the vcpu is able to
250  * run as part of a virtual core, but the task running the vcore
251  * is preempted or sleeping, or when the vcpu needs something done
252  * in the kernel by the task running the vcpu, but that task is
253  * preempted or sleeping.  Those two things have to be counted
254  * separately, since one of the vcpu tasks will take on the job
255  * of running the core, and the other vcpu tasks in the vcore will
256  * sleep waiting for it to do that, but that sleep shouldn't count
257  * as stolen time.
258  *
259  * Hence we accumulate stolen time when the vcpu can run as part of
260  * a vcore using vc->stolen_tb, and the stolen time when the vcpu
261  * needs its task to do other things in the kernel (for example,
262  * service a page fault) in busy_stolen.  We don't accumulate
263  * stolen time for a vcore when it is inactive, or for a vcpu
264  * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
265  * a misnomer; it means that the vcpu task is not executing in
266  * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
267  * the kernel.  We don't have any way of dividing up that time
268  * between time that the vcpu is genuinely stopped, time that
269  * the task is actively working on behalf of the vcpu, and time
270  * that the task is preempted, so we don't count any of it as
271  * stolen.
272  *
273  * Updates to busy_stolen are protected by arch.tbacct_lock;
274  * updates to vc->stolen_tb are protected by the vcore->stoltb_lock.
275  * The stolen times are measured in units of timebase ticks.
276  * (Note that the != TB_NIL checks below are purely defensive;
277  * they should never fail.)
278  */
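/*
 * Hypothetical worked example (timebase values purely illustrative): if the
 * task running the vcore is preempted at tb = 1000 and scheduled back in at
 * tb = 1300, kvmppc_core_start_stolen()/kvmppc_core_end_stolen() below add
 * 300 ticks to vc->stolen_tb.  If a vcpu task in state BUSY_IN_HOST is
 * preempted from tb = 1100 to tb = 1200, kvmppc_core_vcpu_load_hv() adds
 * 100 ticks to busy_stolen.  kvmppc_create_dtl_entry() later reports the
 * per-vcore delta plus busy_stolen to the guest.
 */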
279
280 static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
281 {
282         unsigned long flags;
283
284         spin_lock_irqsave(&vc->stoltb_lock, flags);
285         vc->preempt_tb = mftb();
286         spin_unlock_irqrestore(&vc->stoltb_lock, flags);
287 }
288
289 static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
290 {
291         unsigned long flags;
292
293         spin_lock_irqsave(&vc->stoltb_lock, flags);
294         if (vc->preempt_tb != TB_NIL) {
295                 vc->stolen_tb += mftb() - vc->preempt_tb;
296                 vc->preempt_tb = TB_NIL;
297         }
298         spin_unlock_irqrestore(&vc->stoltb_lock, flags);
299 }
300
301 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
302 {
303         struct kvmppc_vcore *vc = vcpu->arch.vcore;
304         unsigned long flags;
305
306         /*
307          * We can test vc->runner without taking the vcore lock,
308          * because only this task ever sets vc->runner to this
309          * vcpu, and once it is set to this vcpu, only this task
310          * ever sets it to NULL.
311          */
312         if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
313                 kvmppc_core_end_stolen(vc);
314
315         spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
316         if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
317             vcpu->arch.busy_preempt != TB_NIL) {
318                 vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
319                 vcpu->arch.busy_preempt = TB_NIL;
320         }
321         spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
322 }
323
324 static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
325 {
326         struct kvmppc_vcore *vc = vcpu->arch.vcore;
327         unsigned long flags;
328
329         if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
330                 kvmppc_core_start_stolen(vc);
331
332         spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
333         if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
334                 vcpu->arch.busy_preempt = mftb();
335         spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
336 }
337
338 static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
339 {
340         vcpu->arch.pvr = pvr;
341 }
342
343 /* Dummy value used in computing PCR value below */
344 #define PCR_ARCH_31    (PCR_ARCH_300 << 1)
345
346 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
347 {
348         unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
349         struct kvmppc_vcore *vc = vcpu->arch.vcore;
350
351         /* We can (emulate) our own architecture version and anything older */
352         if (cpu_has_feature(CPU_FTR_ARCH_31))
353                 host_pcr_bit = PCR_ARCH_31;
354         else if (cpu_has_feature(CPU_FTR_ARCH_300))
355                 host_pcr_bit = PCR_ARCH_300;
356         else if (cpu_has_feature(CPU_FTR_ARCH_207S))
357                 host_pcr_bit = PCR_ARCH_207;
358         else if (cpu_has_feature(CPU_FTR_ARCH_206))
359                 host_pcr_bit = PCR_ARCH_206;
360         else
361                 host_pcr_bit = PCR_ARCH_205;
362
363         /* Determine lowest PCR bit needed to run guest in given PVR level */
364         guest_pcr_bit = host_pcr_bit;
365         if (arch_compat) {
366                 switch (arch_compat) {
367                 case PVR_ARCH_205:
368                         guest_pcr_bit = PCR_ARCH_205;
369                         break;
370                 case PVR_ARCH_206:
371                 case PVR_ARCH_206p:
372                         guest_pcr_bit = PCR_ARCH_206;
373                         break;
374                 case PVR_ARCH_207:
375                         guest_pcr_bit = PCR_ARCH_207;
376                         break;
377                 case PVR_ARCH_300:
378                         guest_pcr_bit = PCR_ARCH_300;
379                         break;
380                 case PVR_ARCH_31:
381                         guest_pcr_bit = PCR_ARCH_31;
382                         break;
383                 default:
384                         return -EINVAL;
385                 }
386         }
387
388         /* Check requested PCR bits don't exceed our capabilities */
389         if (guest_pcr_bit > host_pcr_bit)
390                 return -EINVAL;
391
392         spin_lock(&vc->lock);
393         vc->arch_compat = arch_compat;
394         /*
395          * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
396          * Also set all reserved PCR bits
397          */
398         vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
399         spin_unlock(&vc->lock);
400
401         return 0;
402 }
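/*
 * Worked example, assuming the PCR_ARCH_* encoding from asm/reg.h
 * (2.05 = 0x2, 2.06 = 0x4, 2.07 = 0x8, 3.00 = 0x10): a POWER9 host
 * (host_pcr_bit = 0x10) running a guest in ISA v2.06 compat mode
 * (guest_pcr_bit = 0x4) computes 0x10 - 0x4 = 0xc, i.e.
 * PCR_ARCH_207 | PCR_ARCH_206, which is every compatibility bit from the
 * guest's level up to, but not including, the host's level, plus the
 * reserved bits contributed by PCR_MASK.
 */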
403
404 static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
405 {
406         int r;
407
408         pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
409         pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
410                vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
411         for (r = 0; r < 16; ++r)
412                 pr_err("r%2d = %.16lx  r%d = %.16lx\n",
413                        r, kvmppc_get_gpr(vcpu, r),
414                        r+16, kvmppc_get_gpr(vcpu, r+16));
415         pr_err("ctr = %.16lx  lr  = %.16lx\n",
416                vcpu->arch.regs.ctr, vcpu->arch.regs.link);
417         pr_err("srr0 = %.16llx srr1 = %.16llx\n",
418                vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
419         pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
420                vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
421         pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
422                vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
423         pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
424                vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
425         pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
426         pr_err("fault dar = %.16lx dsisr = %.8x\n",
427                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
428         pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
429         for (r = 0; r < vcpu->arch.slb_max; ++r)
430                 pr_err("  ESID = %.16llx VSID = %.16llx\n",
431                        vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
432         pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
433                vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
434                vcpu->arch.last_inst);
435 }
436
437 static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
438 {
439         return kvm_get_vcpu_by_id(kvm, id);
440 }
441
442 static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
443 {
444         vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
445         vpa->yield_count = cpu_to_be32(1);
446 }
447
448 static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
449                    unsigned long addr, unsigned long len)
450 {
451         /* check address is cacheline aligned */
452         if (addr & (L1_CACHE_BYTES - 1))
453                 return -EINVAL;
454         spin_lock(&vcpu->arch.vpa_update_lock);
455         if (v->next_gpa != addr || v->len != len) {
456                 v->next_gpa = addr;
457                 v->len = addr ? len : 0;
458                 v->update_pending = 1;
459         }
460         spin_unlock(&vcpu->arch.vpa_update_lock);
461         return 0;
462 }
463
464 /* Length for a per-processor buffer is passed in at offset 4 in the buffer */
465 struct reg_vpa {
466         u32 dummy;
467         union {
468                 __be16 hword;
469                 __be32 word;
470         } length;
471 };
472
473 static int vpa_is_registered(struct kvmppc_vpa *vpap)
474 {
475         if (vpap->update_pending)
476                 return vpap->next_gpa != 0;
477         return vpap->pinned_addr != NULL;
478 }
479
480 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
481                                        unsigned long flags,
482                                        unsigned long vcpuid, unsigned long vpa)
483 {
484         struct kvm *kvm = vcpu->kvm;
485         unsigned long len, nb;
486         void *va;
487         struct kvm_vcpu *tvcpu;
488         int err;
489         int subfunc;
490         struct kvmppc_vpa *vpap;
491
492         tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
493         if (!tvcpu)
494                 return H_PARAMETER;
495
496         subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
497         if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
498             subfunc == H_VPA_REG_SLB) {
499                 /* Registering new area - address must be cache-line aligned */
500                 if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
501                         return H_PARAMETER;
502
503                 /* convert logical addr to kernel addr and read length */
504                 va = kvmppc_pin_guest_page(kvm, vpa, &nb);
505                 if (va == NULL)
506                         return H_PARAMETER;
507                 if (subfunc == H_VPA_REG_VPA)
508                         len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
509                 else
510                         len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
511                 kvmppc_unpin_guest_page(kvm, va, vpa, false);
512
513                 /* Check length */
514                 if (len > nb || len < sizeof(struct reg_vpa))
515                         return H_PARAMETER;
516         } else {
517                 vpa = 0;
518                 len = 0;
519         }
520
521         err = H_PARAMETER;
522         vpap = NULL;
523         spin_lock(&tvcpu->arch.vpa_update_lock);
524
525         switch (subfunc) {
526         case H_VPA_REG_VPA:             /* register VPA */
527                 /*
528                  * The size of our lppaca is 1kB because of the way we align
529                  * it for the guest to avoid crossing a 4kB boundary. We only
530                  * use 640 bytes of the structure though, so we should accept
531                  * clients that set a size of 640.
532                  */
533                 BUILD_BUG_ON(sizeof(struct lppaca) != 640);
534                 if (len < sizeof(struct lppaca))
535                         break;
536                 vpap = &tvcpu->arch.vpa;
537                 err = 0;
538                 break;
539
540         case H_VPA_REG_DTL:             /* register DTL */
541                 if (len < sizeof(struct dtl_entry))
542                         break;
543                 len -= len % sizeof(struct dtl_entry);
544
545                 /* Check that they have previously registered a VPA */
546                 err = H_RESOURCE;
547                 if (!vpa_is_registered(&tvcpu->arch.vpa))
548                         break;
549
550                 vpap = &tvcpu->arch.dtl;
551                 err = 0;
552                 break;
553
554         case H_VPA_REG_SLB:             /* register SLB shadow buffer */
555                 /* Check that they have previously registered a VPA */
556                 err = H_RESOURCE;
557                 if (!vpa_is_registered(&tvcpu->arch.vpa))
558                         break;
559
560                 vpap = &tvcpu->arch.slb_shadow;
561                 err = 0;
562                 break;
563
564         case H_VPA_DEREG_VPA:           /* deregister VPA */
565                 /* Check they don't still have a DTL or SLB buf registered */
566                 err = H_RESOURCE;
567                 if (vpa_is_registered(&tvcpu->arch.dtl) ||
568                     vpa_is_registered(&tvcpu->arch.slb_shadow))
569                         break;
570
571                 vpap = &tvcpu->arch.vpa;
572                 err = 0;
573                 break;
574
575         case H_VPA_DEREG_DTL:           /* deregister DTL */
576                 vpap = &tvcpu->arch.dtl;
577                 err = 0;
578                 break;
579
580         case H_VPA_DEREG_SLB:           /* deregister SLB shadow buffer */
581                 vpap = &tvcpu->arch.slb_shadow;
582                 err = 0;
583                 break;
584         }
585
586         if (vpap) {
587                 vpap->next_gpa = vpa;
588                 vpap->len = len;
589                 vpap->update_pending = 1;
590         }
591
592         spin_unlock(&tvcpu->arch.vpa_update_lock);
593
594         return err;
595 }
596
597 static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
598 {
599         struct kvm *kvm = vcpu->kvm;
600         void *va;
601         unsigned long nb;
602         unsigned long gpa;
603
604         /*
605          * We need to pin the page pointed to by vpap->next_gpa,
606          * but we can't call kvmppc_pin_guest_page under the lock
607          * as it does get_user_pages() and down_read().  So we
608          * have to drop the lock, pin the page, then get the lock
609          * again and check that a new area didn't get registered
610          * in the meantime.
611          */
612         for (;;) {
613                 gpa = vpap->next_gpa;
614                 spin_unlock(&vcpu->arch.vpa_update_lock);
615                 va = NULL;
616                 nb = 0;
617                 if (gpa)
618                         va = kvmppc_pin_guest_page(kvm, gpa, &nb);
619                 spin_lock(&vcpu->arch.vpa_update_lock);
620                 if (gpa == vpap->next_gpa)
621                         break;
622                 /* sigh... unpin that one and try again */
623                 if (va)
624                         kvmppc_unpin_guest_page(kvm, va, gpa, false);
625         }
626
627         vpap->update_pending = 0;
628         if (va && nb < vpap->len) {
629                 /*
630                  * If it's now too short, it must be that userspace
631                  * has changed the mappings underlying guest memory,
632                  * so unregister the region.
633                  */
634                 kvmppc_unpin_guest_page(kvm, va, gpa, false);
635                 va = NULL;
636         }
637         if (vpap->pinned_addr)
638                 kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
639                                         vpap->dirty);
640         vpap->gpa = gpa;
641         vpap->pinned_addr = va;
642         vpap->dirty = false;
643         if (va)
644                 vpap->pinned_end = va + vpap->len;
645 }
646
647 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
648 {
649         if (!(vcpu->arch.vpa.update_pending ||
650               vcpu->arch.slb_shadow.update_pending ||
651               vcpu->arch.dtl.update_pending))
652                 return;
653
654         spin_lock(&vcpu->arch.vpa_update_lock);
655         if (vcpu->arch.vpa.update_pending) {
656                 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
657                 if (vcpu->arch.vpa.pinned_addr)
658                         init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
659         }
660         if (vcpu->arch.dtl.update_pending) {
661                 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
662                 vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
663                 vcpu->arch.dtl_index = 0;
664         }
665         if (vcpu->arch.slb_shadow.update_pending)
666                 kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
667         spin_unlock(&vcpu->arch.vpa_update_lock);
668 }
669
670 /*
671  * Return the accumulated stolen time for the vcore up until `now'.
672  * The caller should hold the vcore lock.
673  */
674 static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
675 {
676         u64 p;
677         unsigned long flags;
678
679         spin_lock_irqsave(&vc->stoltb_lock, flags);
680         p = vc->stolen_tb;
681         if (vc->vcore_state != VCORE_INACTIVE &&
682             vc->preempt_tb != TB_NIL)
683                 p += now - vc->preempt_tb;
684         spin_unlock_irqrestore(&vc->stoltb_lock, flags);
685         return p;
686 }
687
688 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
689                                     struct kvmppc_vcore *vc)
690 {
691         struct dtl_entry *dt;
692         struct lppaca *vpa;
693         unsigned long stolen;
694         unsigned long core_stolen;
695         u64 now;
696         unsigned long flags;
697
698         dt = vcpu->arch.dtl_ptr;
699         vpa = vcpu->arch.vpa.pinned_addr;
700         now = mftb();
701         core_stolen = vcore_stolen_time(vc, now);
702         stolen = core_stolen - vcpu->arch.stolen_logged;
703         vcpu->arch.stolen_logged = core_stolen;
704         spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
705         stolen += vcpu->arch.busy_stolen;
706         vcpu->arch.busy_stolen = 0;
707         spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
708         if (!dt || !vpa)
709                 return;
710         memset(dt, 0, sizeof(struct dtl_entry));
711         dt->dispatch_reason = 7;
712         dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
713         dt->timebase = cpu_to_be64(now + vc->tb_offset);
714         dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
715         dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
716         dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
717         ++dt;
718         if (dt == vcpu->arch.dtl.pinned_end)
719                 dt = vcpu->arch.dtl.pinned_addr;
720         vcpu->arch.dtl_ptr = dt;
721         /* order writing *dt vs. writing vpa->dtl_idx */
722         smp_wmb();
723         vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
724         vcpu->arch.dtl.dirty = true;
725 }
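/*
 * Illustrative note (sizes hypothetical): if the guest registered a DTL of
 * len = 3 * sizeof(struct dtl_entry), dtl_ptr cycles through the three
 * pinned slots and wraps back to pinned_addr, while vpa->dtl_idx keeps
 * counting 1, 2, 3, 4, ...; a guest consumer would locate the most recent
 * entry at slot (dtl_idx - 1) % 3.
 */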
726
727 /* See if there is a doorbell interrupt pending for a vcpu */
728 static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
729 {
730         int thr;
731         struct kvmppc_vcore *vc;
732
733         if (vcpu->arch.doorbell_request)
734                 return true;
735         /*
736          * Ensure that the read of vcore->dpdes comes after the read
737          * of vcpu->doorbell_request.  This barrier matches the
738          * smp_wmb() in kvmppc_guest_entry_inject().
739          */
740         smp_rmb();
741         vc = vcpu->arch.vcore;
742         thr = vcpu->vcpu_id - vc->first_vcpuid;
743         return !!(vc->dpdes & (1 << thr));
744 }
745
746 static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
747 {
748         if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
749                 return true;
750         if ((!vcpu->arch.vcore->arch_compat) &&
751             cpu_has_feature(CPU_FTR_ARCH_207S))
752                 return true;
753         return false;
754 }
755
756 static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
757                              unsigned long resource, unsigned long value1,
758                              unsigned long value2)
759 {
760         switch (resource) {
761         case H_SET_MODE_RESOURCE_SET_CIABR:
762                 if (!kvmppc_power8_compatible(vcpu))
763                         return H_P2;
764                 if (value2)
765                         return H_P4;
766                 if (mflags)
767                         return H_UNSUPPORTED_FLAG_START;
768                 /* Guests can't breakpoint the hypervisor */
769                 if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
770                         return H_P3;
771                 vcpu->arch.ciabr  = value1;
772                 return H_SUCCESS;
773         case H_SET_MODE_RESOURCE_SET_DAWR0:
774                 if (!kvmppc_power8_compatible(vcpu))
775                         return H_P2;
776                 if (!ppc_breakpoint_available())
777                         return H_P2;
778                 if (mflags)
779                         return H_UNSUPPORTED_FLAG_START;
780                 if (value2 & DABRX_HYP)
781                         return H_P4;
782                 vcpu->arch.dawr0  = value1;
783                 vcpu->arch.dawrx0 = value2;
784                 return H_SUCCESS;
785         case H_SET_MODE_RESOURCE_SET_DAWR1:
786                 if (!kvmppc_power8_compatible(vcpu))
787                         return H_P2;
788                 if (!ppc_breakpoint_available())
789                         return H_P2;
790                 if (!cpu_has_feature(CPU_FTR_DAWR1))
791                         return H_P2;
792                 if (!vcpu->kvm->arch.dawr1_enabled)
793                         return H_FUNCTION;
794                 if (mflags)
795                         return H_UNSUPPORTED_FLAG_START;
796                 if (value2 & DABRX_HYP)
797                         return H_P4;
798                 vcpu->arch.dawr1  = value1;
799                 vcpu->arch.dawrx1 = value2;
800                 return H_SUCCESS;
801         case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
802                 /*
803                  * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
804                  * Keep this in sync with kvmppc_filter_guest_lpcr_hv.
805                  */
806                 if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
807                                 kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
808                         return H_UNSUPPORTED_FLAG_START;
809                 return H_TOO_HARD;
810         default:
811                 return H_TOO_HARD;
812         }
813 }
814
815 /* Copy guest memory in place - must reside within a single memslot */
816 static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
817                                   unsigned long len)
818 {
819         struct kvm_memory_slot *to_memslot = NULL;
820         struct kvm_memory_slot *from_memslot = NULL;
821         unsigned long to_addr, from_addr;
822         int r;
823
824         /* Get HPA for from address */
825         from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
826         if (!from_memslot)
827                 return -EFAULT;
828         if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
829                              << PAGE_SHIFT))
830                 return -EINVAL;
831         from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
832         if (kvm_is_error_hva(from_addr))
833                 return -EFAULT;
834         from_addr |= (from & (PAGE_SIZE - 1));
835
836         /* Get HPA for to address */
837         to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
838         if (!to_memslot)
839                 return -EFAULT;
840         if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
841                            << PAGE_SHIFT))
842                 return -EINVAL;
843         to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
844         if (kvm_is_error_hva(to_addr))
845                 return -EFAULT;
846         to_addr |= (to & (PAGE_SIZE - 1));
847
848         /* Perform copy */
849         r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
850                              len);
851         if (r)
852                 return -EFAULT;
853         mark_page_dirty(kvm, to >> PAGE_SHIFT);
854         return 0;
855 }
856
857 static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
858                                unsigned long dest, unsigned long src)
859 {
860         u64 pg_sz = SZ_4K;              /* 4K page size */
861         u64 pg_mask = SZ_4K - 1;
862         int ret;
863
864         /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
865         if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
866                       H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
867                 return H_PARAMETER;
868
869         /* dest (and src if copy_page flag set) must be page aligned */
870         if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
871                 return H_PARAMETER;
872
873         /* zero and/or copy the page as determined by the flags */
874         if (flags & H_COPY_PAGE) {
875                 ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
876                 if (ret < 0)
877                         return H_PARAMETER;
878         } else if (flags & H_ZERO_PAGE) {
879                 ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
880                 if (ret < 0)
881                         return H_PARAMETER;
882         }
883
884         /* We can ignore the remaining flags */
885
886         return H_SUCCESS;
887 }
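/*
 * For reference, a pseries guest zeroing a 4K page would make a call along
 * the lines of (guest-side sketch, dest_rpa being a hypothetical 4K-aligned
 * guest real address; this is not code that runs here):
 *
 *	plpar_hcall_norets(H_PAGE_INIT, H_ZERO_PAGE, dest_rpa, 0);
 *
 * With H_COPY_PAGE set, the final argument carries the source page address
 * and arrives in "src" above.
 */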
888
889 static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
890 {
891         struct kvmppc_vcore *vcore = target->arch.vcore;
892
893         /*
894          * We expect to have been called by the real mode handler
895          * (kvmppc_rm_h_confer()) which would have directly returned
896          * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
897          * have useful work to do and should not confer) so we don't
898          * recheck that here.
899          *
900          * In the P9 single vcpu per vcore case, the real
901          * mode handler is not called but no other threads are in the
902          * source vcore.
903          */
904
905         spin_lock(&vcore->lock);
906         if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
907             vcore->vcore_state != VCORE_INACTIVE &&
908             vcore->runner)
909                 target = vcore->runner;
910         spin_unlock(&vcore->lock);
911
912         return kvm_vcpu_yield_to(target);
913 }
914
915 static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
916 {
917         int yield_count = 0;
918         struct lppaca *lppaca;
919
920         spin_lock(&vcpu->arch.vpa_update_lock);
921         lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
922         if (lppaca)
923                 yield_count = be32_to_cpu(lppaca->yield_count);
924         spin_unlock(&vcpu->arch.vpa_update_lock);
925         return yield_count;
926 }
927
928 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
929 {
930         struct kvm *kvm = vcpu->kvm;
931         unsigned long req = kvmppc_get_gpr(vcpu, 3);
932         unsigned long target, ret = H_SUCCESS;
933         int yield_count;
934         struct kvm_vcpu *tvcpu;
935         int idx, rc;
936
937         if (req <= MAX_HCALL_OPCODE &&
938             !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
939                 return RESUME_HOST;
940
941         switch (req) {
942         case H_REMOVE:
943                 ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
944                                         kvmppc_get_gpr(vcpu, 5),
945                                         kvmppc_get_gpr(vcpu, 6));
946                 if (ret == H_TOO_HARD)
947                         return RESUME_HOST;
948                 break;
949         case H_ENTER:
950                 ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
951                                         kvmppc_get_gpr(vcpu, 5),
952                                         kvmppc_get_gpr(vcpu, 6),
953                                         kvmppc_get_gpr(vcpu, 7));
954                 if (ret == H_TOO_HARD)
955                         return RESUME_HOST;
956                 break;
957         case H_READ:
958                 ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
959                                         kvmppc_get_gpr(vcpu, 5));
960                 if (ret == H_TOO_HARD)
961                         return RESUME_HOST;
962                 break;
963         case H_CLEAR_MOD:
964                 ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
965                                         kvmppc_get_gpr(vcpu, 5));
966                 if (ret == H_TOO_HARD)
967                         return RESUME_HOST;
968                 break;
969         case H_CLEAR_REF:
970                 ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
971                                         kvmppc_get_gpr(vcpu, 5));
972                 if (ret == H_TOO_HARD)
973                         return RESUME_HOST;
974                 break;
975         case H_PROTECT:
976                 ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
977                                         kvmppc_get_gpr(vcpu, 5),
978                                         kvmppc_get_gpr(vcpu, 6));
979                 if (ret == H_TOO_HARD)
980                         return RESUME_HOST;
981                 break;
982         case H_BULK_REMOVE:
983                 ret = kvmppc_h_bulk_remove(vcpu);
984                 if (ret == H_TOO_HARD)
985                         return RESUME_HOST;
986                 break;
987
988         case H_CEDE:
989                 break;
990         case H_PROD:
991                 target = kvmppc_get_gpr(vcpu, 4);
992                 tvcpu = kvmppc_find_vcpu(kvm, target);
993                 if (!tvcpu) {
994                         ret = H_PARAMETER;
995                         break;
996                 }
997                 tvcpu->arch.prodded = 1;
998                 smp_mb();
999                 if (tvcpu->arch.ceded)
1000                         kvmppc_fast_vcpu_kick_hv(tvcpu);
1001                 break;
1002         case H_CONFER:
1003                 target = kvmppc_get_gpr(vcpu, 4);
1004                 if (target == -1)
1005                         break;
1006                 tvcpu = kvmppc_find_vcpu(kvm, target);
1007                 if (!tvcpu) {
1008                         ret = H_PARAMETER;
1009                         break;
1010                 }
1011                 yield_count = kvmppc_get_gpr(vcpu, 5);
1012                 if (kvmppc_get_yield_count(tvcpu) != yield_count)
1013                         break;
1014                 kvm_arch_vcpu_yield_to(tvcpu);
1015                 break;
1016         case H_REGISTER_VPA:
1017                 ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
1018                                         kvmppc_get_gpr(vcpu, 5),
1019                                         kvmppc_get_gpr(vcpu, 6));
1020                 break;
1021         case H_RTAS:
1022                 if (list_empty(&kvm->arch.rtas_tokens))
1023                         return RESUME_HOST;
1024
1025                 idx = srcu_read_lock(&kvm->srcu);
1026                 rc = kvmppc_rtas_hcall(vcpu);
1027                 srcu_read_unlock(&kvm->srcu, idx);
1028
1029                 if (rc == -ENOENT)
1030                         return RESUME_HOST;
1031                 else if (rc == 0)
1032                         break;
1033
1034                 /* Send the error out to userspace via KVM_RUN */
1035                 return rc;
1036         case H_LOGICAL_CI_LOAD:
1037                 ret = kvmppc_h_logical_ci_load(vcpu);
1038                 if (ret == H_TOO_HARD)
1039                         return RESUME_HOST;
1040                 break;
1041         case H_LOGICAL_CI_STORE:
1042                 ret = kvmppc_h_logical_ci_store(vcpu);
1043                 if (ret == H_TOO_HARD)
1044                         return RESUME_HOST;
1045                 break;
1046         case H_SET_MODE:
1047                 ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
1048                                         kvmppc_get_gpr(vcpu, 5),
1049                                         kvmppc_get_gpr(vcpu, 6),
1050                                         kvmppc_get_gpr(vcpu, 7));
1051                 if (ret == H_TOO_HARD)
1052                         return RESUME_HOST;
1053                 break;
1054         case H_XIRR:
1055         case H_CPPR:
1056         case H_EOI:
1057         case H_IPI:
1058         case H_IPOLL:
1059         case H_XIRR_X:
1060                 if (kvmppc_xics_enabled(vcpu)) {
1061                         if (xics_on_xive()) {
1062                                 ret = H_NOT_AVAILABLE;
1063                                 return RESUME_GUEST;
1064                         }
1065                         ret = kvmppc_xics_hcall(vcpu, req);
1066                         break;
1067                 }
1068                 return RESUME_HOST;
1069         case H_SET_DABR:
1070                 ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
1071                 break;
1072         case H_SET_XDABR:
1073                 ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
1074                                                 kvmppc_get_gpr(vcpu, 5));
1075                 break;
1076 #ifdef CONFIG_SPAPR_TCE_IOMMU
1077         case H_GET_TCE:
1078                 ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1079                                                 kvmppc_get_gpr(vcpu, 5));
1080                 if (ret == H_TOO_HARD)
1081                         return RESUME_HOST;
1082                 break;
1083         case H_PUT_TCE:
1084                 ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1085                                                 kvmppc_get_gpr(vcpu, 5),
1086                                                 kvmppc_get_gpr(vcpu, 6));
1087                 if (ret == H_TOO_HARD)
1088                         return RESUME_HOST;
1089                 break;
1090         case H_PUT_TCE_INDIRECT:
1091                 ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
1092                                                 kvmppc_get_gpr(vcpu, 5),
1093                                                 kvmppc_get_gpr(vcpu, 6),
1094                                                 kvmppc_get_gpr(vcpu, 7));
1095                 if (ret == H_TOO_HARD)
1096                         return RESUME_HOST;
1097                 break;
1098         case H_STUFF_TCE:
1099                 ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1100                                                 kvmppc_get_gpr(vcpu, 5),
1101                                                 kvmppc_get_gpr(vcpu, 6),
1102                                                 kvmppc_get_gpr(vcpu, 7));
1103                 if (ret == H_TOO_HARD)
1104                         return RESUME_HOST;
1105                 break;
1106 #endif
1107         case H_RANDOM:
1108                 if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
1109                         ret = H_HARDWARE;
1110                 break;
1111
1112         case H_SET_PARTITION_TABLE:
1113                 ret = H_FUNCTION;
1114                 if (nesting_enabled(kvm))
1115                         ret = kvmhv_set_partition_table(vcpu);
1116                 break;
1117         case H_ENTER_NESTED:
1118                 ret = H_FUNCTION;
1119                 if (!nesting_enabled(kvm))
1120                         break;
1121                 ret = kvmhv_enter_nested_guest(vcpu);
1122                 if (ret == H_INTERRUPT) {
1123                         kvmppc_set_gpr(vcpu, 3, 0);
1124                         vcpu->arch.hcall_needed = 0;
1125                         return -EINTR;
1126                 } else if (ret == H_TOO_HARD) {
1127                         kvmppc_set_gpr(vcpu, 3, 0);
1128                         vcpu->arch.hcall_needed = 0;
1129                         return RESUME_HOST;
1130                 }
1131                 break;
1132         case H_TLB_INVALIDATE:
1133                 ret = H_FUNCTION;
1134                 if (nesting_enabled(kvm))
1135                         ret = kvmhv_do_nested_tlbie(vcpu);
1136                 break;
1137         case H_COPY_TOFROM_GUEST:
1138                 ret = H_FUNCTION;
1139                 if (nesting_enabled(kvm))
1140                         ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1141                 break;
1142         case H_PAGE_INIT:
1143                 ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
1144                                          kvmppc_get_gpr(vcpu, 5),
1145                                          kvmppc_get_gpr(vcpu, 6));
1146                 break;
1147         case H_SVM_PAGE_IN:
1148                 ret = H_UNSUPPORTED;
1149                 if (kvmppc_get_srr1(vcpu) & MSR_S)
1150                         ret = kvmppc_h_svm_page_in(kvm,
1151                                                    kvmppc_get_gpr(vcpu, 4),
1152                                                    kvmppc_get_gpr(vcpu, 5),
1153                                                    kvmppc_get_gpr(vcpu, 6));
1154                 break;
1155         case H_SVM_PAGE_OUT:
1156                 ret = H_UNSUPPORTED;
1157                 if (kvmppc_get_srr1(vcpu) & MSR_S)
1158                         ret = kvmppc_h_svm_page_out(kvm,
1159                                                     kvmppc_get_gpr(vcpu, 4),
1160                                                     kvmppc_get_gpr(vcpu, 5),
1161                                                     kvmppc_get_gpr(vcpu, 6));
1162                 break;
1163         case H_SVM_INIT_START:
1164                 ret = H_UNSUPPORTED;
1165                 if (kvmppc_get_srr1(vcpu) & MSR_S)
1166                         ret = kvmppc_h_svm_init_start(kvm);
1167                 break;
1168         case H_SVM_INIT_DONE:
1169                 ret = H_UNSUPPORTED;
1170                 if (kvmppc_get_srr1(vcpu) & MSR_S)
1171                         ret = kvmppc_h_svm_init_done(kvm);
1172                 break;
1173         case H_SVM_INIT_ABORT:
1174                 /*
1175                  * Even if that call is made by the Ultravisor, the SRR1 value
1176                  * is the guest context one, with the secure bit clear as it has
1177                  * not yet been secured. So we can't check it here.
1178                  * Instead the kvm->arch.secure_guest flag is checked inside
1179                  * kvmppc_h_svm_init_abort().
1180                  */
1181                 ret = kvmppc_h_svm_init_abort(kvm);
1182                 break;
1183
1184         default:
1185                 return RESUME_HOST;
1186         }
1187         WARN_ON_ONCE(ret == H_TOO_HARD);
1188         kvmppc_set_gpr(vcpu, 3, ret);
1189         vcpu->arch.hcall_needed = 0;
1190         return RESUME_GUEST;
1191 }
1192
1193 /*
1194  * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
1195  * handlers in book3s_hv_rmhandlers.S.
1196  *
1197  * This has to be done early, not in kvmppc_pseries_do_hcall(), so
1198  * that the cede logic in kvmppc_run_single_vcpu() works properly.
1199  */
1200 static void kvmppc_cede(struct kvm_vcpu *vcpu)
1201 {
1202         vcpu->arch.shregs.msr |= MSR_EE;
1203         vcpu->arch.ceded = 1;
1204         smp_mb();
1205         if (vcpu->arch.prodded) {
1206                 vcpu->arch.prodded = 0;
1207                 smp_mb();
1208                 vcpu->arch.ceded = 0;
1209         }
1210 }
1211
1212 static int kvmppc_hcall_impl_hv(unsigned long cmd)
1213 {
1214         switch (cmd) {
1215         case H_CEDE:
1216         case H_PROD:
1217         case H_CONFER:
1218         case H_REGISTER_VPA:
1219         case H_SET_MODE:
1220         case H_LOGICAL_CI_LOAD:
1221         case H_LOGICAL_CI_STORE:
1222 #ifdef CONFIG_KVM_XICS
1223         case H_XIRR:
1224         case H_CPPR:
1225         case H_EOI:
1226         case H_IPI:
1227         case H_IPOLL:
1228         case H_XIRR_X:
1229 #endif
1230         case H_PAGE_INIT:
1231                 return 1;
1232         }
1233
1234         /* See if it's in the real-mode table */
1235         return kvmppc_hcall_impl_hv_realmode(cmd);
1236 }
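/*
 * For context: this is the hcall_implemented back-end consulted when
 * userspace uses KVM_ENABLE_CAP with KVM_CAP_PPC_ENABLE_HCALL on the VM fd
 * to flip bits in kvm->arch.enabled_hcalls; kvmppc_pseries_do_hcall() above
 * punts any hcall that has not been enabled back out to userspace
 * (a descriptive note, based on the generic powerpc.c cap handling).
 */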
1237
1238 static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
1239 {
1240         u32 last_inst;
1241
1242         if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
1243                                         EMULATE_DONE) {
1244                 /*
1245                  * Fetch failed, so return to guest and
1246                  * try executing it again.
1247                  */
1248                 return RESUME_GUEST;
1249         }
1250
1251         if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
1252                 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
1253                 vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
1254                 return RESUME_HOST;
1255         } else {
1256                 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1257                 return RESUME_GUEST;
1258         }
1259 }
1260
1261 static void do_nothing(void *x)
1262 {
1263 }
1264
1265 static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
1266 {
1267         int thr, cpu, pcpu, nthreads;
1268         struct kvm_vcpu *v;
1269         unsigned long dpdes;
1270
1271         nthreads = vcpu->kvm->arch.emul_smt_mode;
1272         dpdes = 0;
1273         cpu = vcpu->vcpu_id & ~(nthreads - 1);
1274         for (thr = 0; thr < nthreads; ++thr, ++cpu) {
1275                 v = kvmppc_find_vcpu(vcpu->kvm, cpu);
1276                 if (!v)
1277                         continue;
1278                 /*
1279                  * If the vcpu is currently running on a physical cpu thread,
1280                  * interrupt it in order to pull it out of the guest briefly,
1281                  * which will update its vcore->dpdes value.
1282                  */
1283                 pcpu = READ_ONCE(v->cpu);
1284                 if (pcpu >= 0)
1285                         smp_call_function_single(pcpu, do_nothing, NULL, 1);
1286                 if (kvmppc_doorbell_pending(v))
1287                         dpdes |= 1 << thr;
1288         }
1289         return dpdes;
1290 }
1291
1292 /*
1293  * On POWER9, emulate doorbell-related instructions in order to
1294  * give the guest the illusion of running on a multi-threaded core.
1295  * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
1296  * and mfspr DPDES.
1297  */
1298 static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
1299 {
1300         u32 inst, rb, thr;
1301         unsigned long arg;
1302         struct kvm *kvm = vcpu->kvm;
1303         struct kvm_vcpu *tvcpu;
1304
1305         if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
1306                 return RESUME_GUEST;
1307         if (get_op(inst) != 31)
1308                 return EMULATE_FAIL;
1309         rb = get_rb(inst);
1310         thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
1311         switch (get_xop(inst)) {
1312         case OP_31_XOP_MSGSNDP:
1313                 arg = kvmppc_get_gpr(vcpu, rb);
1314                 if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
1315                         break;
1316                 arg &= 0x7f;
1317                 if (arg >= kvm->arch.emul_smt_mode)
1318                         break;
1319                 tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
1320                 if (!tvcpu)
1321                         break;
1322                 if (!tvcpu->arch.doorbell_request) {
1323                         tvcpu->arch.doorbell_request = 1;
1324                         kvmppc_fast_vcpu_kick_hv(tvcpu);
1325                 }
1326                 break;
1327         case OP_31_XOP_MSGCLRP:
1328                 arg = kvmppc_get_gpr(vcpu, rb);
1329                 if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
1330                         break;
1331                 vcpu->arch.vcore->dpdes = 0;
1332                 vcpu->arch.doorbell_request = 0;
1333                 break;
1334         case OP_31_XOP_MFSPR:
1335                 switch (get_sprn(inst)) {
1336                 case SPRN_TIR:
1337                         arg = thr;
1338                         break;
1339                 case SPRN_DPDES:
1340                         arg = kvmppc_read_dpdes(vcpu);
1341                         break;
1342                 default:
1343                         return EMULATE_FAIL;
1344                 }
1345                 kvmppc_set_gpr(vcpu, get_rt(inst), arg);
1346                 break;
1347         default:
1348                 return EMULATE_FAIL;
1349         }
1350         kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
1351         return RESUME_GUEST;
1352 }
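/*
 * For reference, the RB value a guest passes to msgsndp is expected to have
 * the form PPC_DBELL_TYPE(PPC_DBELL_SERVER) | tir: the handler above checks
 * that the 5-bit field at (arg >> 27) & 0x1f equals PPC_DBELL_SERVER and
 * uses the low 7 bits (arg & 0x7f) as the target thread number within the
 * emulated SMT core (a summary of the decoding above, not a full ISA
 * description).
 */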
1353
1354 static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
1355                                  struct task_struct *tsk)
1356 {
1357         struct kvm_run *run = vcpu->run;
1358         int r = RESUME_HOST;
1359
1360         vcpu->stat.sum_exits++;
1361
1362         /*
1363          * This can happen if an interrupt occurs in the last stages
1364          * of guest entry or the first stages of guest exit (i.e. after
1365          * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1366          * and before setting it to KVM_GUEST_MODE_HOST_HV).
1367          * That can happen due to a bug, or due to a machine check
1368          * occurring at just the wrong time.
1369          */
1370         if (vcpu->arch.shregs.msr & MSR_HV) {
1371                 printk(KERN_EMERG "KVM trap in HV mode!\n");
1372                 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1373                         vcpu->arch.trap, kvmppc_get_pc(vcpu),
1374                         vcpu->arch.shregs.msr);
1375                 kvmppc_dump_regs(vcpu);
1376                 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1377                 run->hw.hardware_exit_reason = vcpu->arch.trap;
1378                 return RESUME_HOST;
1379         }
1380         run->exit_reason = KVM_EXIT_UNKNOWN;
1381         run->ready_for_interrupt_injection = 1;
1382         switch (vcpu->arch.trap) {
1383         /* We're good on these - the host merely wanted to get our attention */
1384         case BOOK3S_INTERRUPT_HV_DECREMENTER:
1385                 vcpu->stat.dec_exits++;
1386                 r = RESUME_GUEST;
1387                 break;
1388         case BOOK3S_INTERRUPT_EXTERNAL:
1389         case BOOK3S_INTERRUPT_H_DOORBELL:
1390         case BOOK3S_INTERRUPT_H_VIRT:
1391                 vcpu->stat.ext_intr_exits++;
1392                 r = RESUME_GUEST;
1393                 break;
1394         /* SR/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
1395         case BOOK3S_INTERRUPT_HMI:
1396         case BOOK3S_INTERRUPT_PERFMON:
1397         case BOOK3S_INTERRUPT_SYSTEM_RESET:
1398                 r = RESUME_GUEST;
1399                 break;
1400         case BOOK3S_INTERRUPT_MACHINE_CHECK: {
1401                 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1402                                               DEFAULT_RATELIMIT_BURST);
1403                 /*
1404                  * Print the MCE event to host console. Ratelimit so the guest
1405                  * can't flood the host log.
1406                  */
1407                 if (__ratelimit(&rs))
1408                         machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
1409
1410                 /*
1411                  * If the guest can do FWNMI, exit to userspace so it can
1412                  * deliver a FWNMI to the guest.
1413                  * Otherwise we synthesize a machine check for the guest
1414                  * so that it knows that the machine check occurred.
1415                  */
1416                 if (!vcpu->kvm->arch.fwnmi_enabled) {
1417                         ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
1418                         kvmppc_core_queue_machine_check(vcpu, flags);
1419                         r = RESUME_GUEST;
1420                         break;
1421                 }
1422
1423                 /* Exit to userspace with KVM_EXIT_NMI as exit reason */
1424                 run->exit_reason = KVM_EXIT_NMI;
1425                 run->hw.hardware_exit_reason = vcpu->arch.trap;
1426                 /* Clear out the old NMI status from run->flags */
1427                 run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
1428                 /* Now set the NMI status */
1429                 if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
1430                         run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
1431                 else
1432                         run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
1433
1434                 r = RESUME_HOST;
1435                 break;
1436         }
1437         case BOOK3S_INTERRUPT_PROGRAM:
1438         {
1439                 ulong flags;
1440                 /*
1441                  * Normally program interrupts are delivered directly
1442                  * to the guest by the hardware, but we can get here
1443                  * as a result of a hypervisor emulation interrupt
1444                  * (e40) getting turned into a 700 by BML RTAS.
1445                  */
1446                 flags = vcpu->arch.shregs.msr & 0x1f0000ull;
1447                 kvmppc_core_queue_program(vcpu, flags);
1448                 r = RESUME_GUEST;
1449                 break;
1450         }
1451         case BOOK3S_INTERRUPT_SYSCALL:
1452         {
1453                 int i;
1454
1455                 if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
1456                         /*
1457                          * Guest userspace executed sc 1. This can only be
1458                          * reached by the P9 path because the old path
1459                          * handles this case in realmode hcall handlers.
1460                          */
1461                         if (!kvmhv_vcpu_is_radix(vcpu)) {
1462                                 /*
1463                                  * A guest could be running PR KVM, so this
1464                                  * may be a PR KVM hcall. It must be reflected
1465                                  * to the guest kernel as a sc interrupt.
1466                                  */
1467                                 kvmppc_core_queue_syscall(vcpu);
1468                         } else {
1469                                 /*
1470                                  * Radix guests cannot run PR KVM or nested HV
1471                                  * hash guests which might run PR KVM, so this
1472                                  * is always a privilege fault. Send a program
1473                                  * check to guest kernel.
1474                                  */
1475                                 kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
1476                         }
1477                         r = RESUME_GUEST;
1478                         break;
1479                 }
1480
1481                 /*
1482                  * hcall - gather args and set exit_reason. This will next be
1483                  * handled by kvmppc_pseries_do_hcall which may be able to deal
1484                  * with it and resume guest, or may punt to userspace.
1485                  */
1486                 run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
1487                 for (i = 0; i < 9; ++i)
1488                         run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
1489                 run->exit_reason = KVM_EXIT_PAPR_HCALL;
1490                 vcpu->arch.hcall_needed = 1;
1491                 r = RESUME_HOST;
1492                 break;
1493         }
1494         /*
1495          * We get these next two if the guest accesses a page which it thinks
1496          * it has mapped but which is not actually present, either because
1497          * it is for an emulated I/O device or because the corresponding
1498          * host page has been paged out.
1499          *
1500          * Any other HDSI/HISI interrupts have been handled already for P7/8
1501          * guests. For POWER9 hash guests not using rmhandlers, basic hash
1502          * fault handling is done here.
1503          */
1504         case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
1505                 unsigned long vsid;
1506                 long err;
1507
1508                 if (vcpu->arch.fault_dsisr == HDSISR_CANARY) {
1509                         r = RESUME_GUEST; /* Just retry if it's the canary */
1510                         break;
1511                 }
1512
1513                 if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
1514                         /*
1515                          * Radix doesn't require anything, and pre-ISAv3.0 hash
1516                          * already attempted to handle this in rmhandlers. The
1517                          * hash fault handling below is v3 only (it uses ASDR
1518                          * via fault_gpa).
1519                          */
1520                         r = RESUME_PAGE_FAULT;
1521                         break;
1522                 }
1523
1524                 if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
1525                         kvmppc_core_queue_data_storage(vcpu,
1526                                 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
1527                         r = RESUME_GUEST;
1528                         break;
1529                 }
1530
1531                 if (!(vcpu->arch.shregs.msr & MSR_DR))
1532                         vsid = vcpu->kvm->arch.vrma_slb_v;
1533                 else
1534                         vsid = vcpu->arch.fault_gpa;
1535
1536                 err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1537                                 vsid, vcpu->arch.fault_dsisr, true);
1538                 if (err == 0) {
1539                         r = RESUME_GUEST;
1540                 } else if (err == -1 || err == -2) {
1541                         r = RESUME_PAGE_FAULT;
1542                 } else {
1543                         kvmppc_core_queue_data_storage(vcpu,
1544                                 vcpu->arch.fault_dar, err);
1545                         r = RESUME_GUEST;
1546                 }
1547                 break;
1548         }
1549         case BOOK3S_INTERRUPT_H_INST_STORAGE: {
1550                 unsigned long vsid;
1551                 long err;
1552
1553                 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1554                 vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
1555                         DSISR_SRR1_MATCH_64S;
1556                 if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
1557                         /*
1558                          * Radix doesn't require anything, and pre-ISAv3.0 hash
1559                          * already attempted to handle this in rmhandlers. The
1560                          * hash fault handling below is v3 only (it uses ASDR
1561                          * via fault_gpa).
1562                          */
1563                         if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1564                                 vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1565                         r = RESUME_PAGE_FAULT;
1566                         break;
1567                 }
1568
1569                 if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
1570                         kvmppc_core_queue_inst_storage(vcpu,
1571                                 vcpu->arch.fault_dsisr);
1572                         r = RESUME_GUEST;
1573                         break;
1574                 }
1575
1576                 if (!(vcpu->arch.shregs.msr & MSR_IR))
1577                         vsid = vcpu->kvm->arch.vrma_slb_v;
1578                 else
1579                         vsid = vcpu->arch.fault_gpa;
1580
1581                 err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1582                                 vsid, vcpu->arch.fault_dsisr, false);
1583                 if (err == 0) {
1584                         r = RESUME_GUEST;
1585                 } else if (err == -1) {
1586                         r = RESUME_PAGE_FAULT;
1587                 } else {
1588                         kvmppc_core_queue_inst_storage(vcpu, err);
1589                         r = RESUME_GUEST;
1590                 }
1591                 break;
1592         }
1593
1594         /*
1595          * This occurs if the guest executes an illegal instruction.
1596          * If guest debug is disabled, generate a program interrupt
1597          * to the guest. If guest debug is enabled, check whether the
1598          * instruction is a software breakpoint and return to the guest
1599          * or to the host accordingly.
1600          */
1601         case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
1602                 if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
1603                         vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
1604                                 swab32(vcpu->arch.emul_inst) :
1605                                 vcpu->arch.emul_inst;
1606                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
1607                         r = kvmppc_emulate_debug_inst(vcpu);
1608                 } else {
1609                         kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1610                         r = RESUME_GUEST;
1611                 }
1612                 break;
1613         /*
1614          * This occurs if the guest (kernel or userspace) does something that
1615          * is prohibited by HFSCR.
1616          * On POWER9, this could be a doorbell instruction that we need
1617          * to emulate.
1618          * Otherwise, we just generate a program interrupt to the guest.
1619          */
1620         case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
1621                 r = EMULATE_FAIL;
1622                 if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
1623                     cpu_has_feature(CPU_FTR_ARCH_300))
1624                         r = kvmppc_emulate_doorbell_instr(vcpu);
1625                 if (r == EMULATE_FAIL) {
1626                         kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1627                         r = RESUME_GUEST;
1628                 }
1629                 break;
1630
1631 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1632         case BOOK3S_INTERRUPT_HV_SOFTPATCH:
1633                 /*
1634                  * This occurs for various TM-related instructions that
1635                  * we need to emulate on POWER9 DD2.2.  We have already
1636                  * handled the cases where the guest was in real-suspend
1637                  * mode and was transitioning to transactional state.
1638                  */
1639                 r = kvmhv_p9_tm_emulation(vcpu);
1640                 break;
1641 #endif
1642
1643         case BOOK3S_INTERRUPT_HV_RM_HARD:
1644                 r = RESUME_PASSTHROUGH;
1645                 break;
1646         default:
1647                 kvmppc_dump_regs(vcpu);
1648                 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1649                         vcpu->arch.trap, kvmppc_get_pc(vcpu),
1650                         vcpu->arch.shregs.msr);
1651                 run->hw.hardware_exit_reason = vcpu->arch.trap;
1652                 r = RESUME_HOST;
1653                 break;
1654         }
1655
1656         return r;
1657 }
1658
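     /*
      * As above, but for exits taken while running a nested (L2) guest
      * on behalf of an L1 hypervisor.  Anything not handled here is
      * returned as RESUME_HOST so that it is reflected back to L1.
      */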
1659 static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
1660 {
1661         int r;
1662         int srcu_idx;
1663
1664         vcpu->stat.sum_exits++;
1665
1666         /*
1667          * This can happen if an interrupt occurs in the last stages
1668          * of guest entry or the first stages of guest exit (i.e. after
1669          * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1670          * and before setting it to KVM_GUEST_MODE_HOST_HV).
1671          * That can happen due to a bug, or due to a machine check
1672          * occurring at just the wrong time.
1673          */
1674         if (vcpu->arch.shregs.msr & MSR_HV) {
1675                 pr_emerg("KVM trap in HV mode while nested!\n");
1676                 pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1677                          vcpu->arch.trap, kvmppc_get_pc(vcpu),
1678                          vcpu->arch.shregs.msr);
1679                 kvmppc_dump_regs(vcpu);
1680                 return RESUME_HOST;
1681         }
1682         switch (vcpu->arch.trap) {
1683         /* We're good on these - the host merely wanted to get our attention */
1684         case BOOK3S_INTERRUPT_HV_DECREMENTER:
1685                 vcpu->stat.dec_exits++;
1686                 r = RESUME_GUEST;
1687                 break;
1688         case BOOK3S_INTERRUPT_EXTERNAL:
1689                 vcpu->stat.ext_intr_exits++;
1690                 r = RESUME_HOST;
1691                 break;
1692         case BOOK3S_INTERRUPT_H_DOORBELL:
1693         case BOOK3S_INTERRUPT_H_VIRT:
1694                 vcpu->stat.ext_intr_exits++;
1695                 r = RESUME_GUEST;
1696                 break;
1697         /* SR/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
1698         case BOOK3S_INTERRUPT_HMI:
1699         case BOOK3S_INTERRUPT_PERFMON:
1700         case BOOK3S_INTERRUPT_SYSTEM_RESET:
1701                 r = RESUME_GUEST;
1702                 break;
1703         case BOOK3S_INTERRUPT_MACHINE_CHECK:
1704         {
1705                 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1706                                               DEFAULT_RATELIMIT_BURST);
1707                 /* Pass the machine check to the L1 guest */
1708                 r = RESUME_HOST;
1709                 /* Print the MCE event to host console. */
1710                 if (__ratelimit(&rs))
1711                         machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
1712                 break;
1713         }
1714         /*
1715          * We get these next two if the guest accesses a page which it thinks
1716          * it has mapped but which is not actually present, either because
1717          * it is for an emulated I/O device or because the corresponding
1718          * host page has been paged out.
1719          */
1720         case BOOK3S_INTERRUPT_H_DATA_STORAGE:
1721                 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1722                 r = kvmhv_nested_page_fault(vcpu);
1723                 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1724                 break;
1725         case BOOK3S_INTERRUPT_H_INST_STORAGE:
1726                 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1727                 vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
1728                                          DSISR_SRR1_MATCH_64S;
1729                 if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1730                         vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1731                 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1732                 r = kvmhv_nested_page_fault(vcpu);
1733                 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1734                 break;
1735
1736 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1737         case BOOK3S_INTERRUPT_HV_SOFTPATCH:
1738                 /*
1739                  * This occurs for various TM-related instructions that
1740                  * we need to emulate on POWER9 DD2.2.  We have already
1741                  * handled the cases where the guest was in real-suspend
1742                  * mode and was transitioning to transactional state.
1743                  */
1744                 r = kvmhv_p9_tm_emulation(vcpu);
1745                 break;
1746 #endif
1747
1748         case BOOK3S_INTERRUPT_HV_RM_HARD:
1749                 vcpu->arch.trap = 0;
1750                 r = RESUME_GUEST;
1751                 if (!xics_on_xive())
1752                         kvmppc_xics_rm_complete(vcpu, 0);
1753                 break;
1754         default:
1755                 r = RESUME_HOST;
1756                 break;
1757         }
1758
1759         return r;
1760 }
1761
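     /*
      * The sregs interface on HV only carries the PVR and the valid SLB
      * entries; all other guest state goes through the ONE_REG interface
      * further down.
      */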
1762 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
1763                                             struct kvm_sregs *sregs)
1764 {
1765         int i;
1766
1767         memset(sregs, 0, sizeof(struct kvm_sregs));
1768         sregs->pvr = vcpu->arch.pvr;
1769         for (i = 0; i < vcpu->arch.slb_max; i++) {
1770                 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
1771                 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
1772         }
1773
1774         return 0;
1775 }
1776
1777 static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
1778                                             struct kvm_sregs *sregs)
1779 {
1780         int i, j;
1781
1782         /* Only accept the same PVR as the host's, since we can't spoof it */
1783         if (sregs->pvr != vcpu->arch.pvr)
1784                 return -EINVAL;
1785
1786         j = 0;
1787         for (i = 0; i < vcpu->arch.slb_nr; i++) {
1788                 if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
1789                         vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
1790                         vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
1791                         ++j;
1792                 }
1793         }
1794         vcpu->arch.slb_max = j;
1795
1796         return 0;
1797 }
1798
1799 /*
1800  * Enforce limits on guest LPCR values based on hardware availability,
1801  * guest configuration, and possibly hypervisor support and security
1802  * concerns.
1803  */
1804 unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
1805 {
1806         /* LPCR_TC only applies to HPT guests */
1807         if (kvm_is_radix(kvm))
1808                 lpcr &= ~LPCR_TC;
1809
1810         /* On POWER8 and above, userspace can modify AIL */
1811         if (!cpu_has_feature(CPU_FTR_ARCH_207S))
1812                 lpcr &= ~LPCR_AIL;
1813         if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
1814                 lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
1815         /*
1816          * On some POWER9s we force AIL off for radix guests to prevent
1817          * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
1818          * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
1819          * be cached, which the host TLB management does not expect.
1820          */
1821         if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
1822                 lpcr &= ~LPCR_AIL;
1823
1824         /*
1825          * On POWER9, allow userspace to enable large decrementer for the
1826          * guest, whether or not the host has it enabled.
1827          */
1828         if (!cpu_has_feature(CPU_FTR_ARCH_300))
1829                 lpcr &= ~LPCR_LD;
1830
1831         return lpcr;
1832 }
1833
1834 static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
1835 {
1836         if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) {
1837                 WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n",
1838                           lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr));
1839         }
1840 }
1841
1842 static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
1843                 bool preserve_top32)
1844 {
1845         struct kvm *kvm = vcpu->kvm;
1846         struct kvmppc_vcore *vc = vcpu->arch.vcore;
1847         u64 mask;
1848
1849         spin_lock(&vc->lock);
1850
1851         /*
1852          * Userspace can only modify
1853          * DPFD (default prefetch depth), ILE (interrupt little-endian),
1854          * TC (translation control), AIL (alternate interrupt location),
1855          * LD (large decrementer).
1856          * These are subject to restrictions from kvmppc_filter_lpcr_hv().
1857          */
1858         mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD;
1859
1860         /* Broken 32-bit version of LPCR must not clear top bits */
1861         if (preserve_top32)
1862                 mask &= 0xFFFFFFFF;
1863
1864         new_lpcr = kvmppc_filter_lpcr_hv(kvm,
1865                         (vc->lpcr & ~mask) | (new_lpcr & mask));
1866
1867         /*
1868          * If ILE (interrupt little-endian) has changed, update the
1869          * MSR_LE bit in the intr_msr for each vcpu in this vcore.
1870          */
1871         if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
1872                 struct kvm_vcpu *vcpu;
1873                 int i;
1874
1875                 kvm_for_each_vcpu(i, vcpu, kvm) {
1876                         if (vcpu->arch.vcore != vc)
1877                                 continue;
1878                         if (new_lpcr & LPCR_ILE)
1879                                 vcpu->arch.intr_msr |= MSR_LE;
1880                         else
1881                                 vcpu->arch.intr_msr &= ~MSR_LE;
1882                 }
1883         }
1884
1885         vc->lpcr = new_lpcr;
1886
1887         spin_unlock(&vc->lock);
1888 }
1889
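     /*
      * ONE_REG accessors.  For reference, an illustrative userspace
      * sequence (hypothetical, not part of this file) for reading and
      * updating one of these registers, e.g. LPCR, would be:
      *
      *         __u64 lpcr_val;
      *         struct kvm_one_reg reg = {
      *                 .id   = KVM_REG_PPC_LPCR_64,
      *                 .addr = (__u64)&lpcr_val,
      *         };
      *         ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
      *         lpcr_val |= LPCR_ILE;
      *         ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
      *
      * The set side lands in kvmppc_set_lpcr() above, which filters the
      * value through kvmppc_filter_lpcr_hv().
      */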
1890 static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1891                                  union kvmppc_one_reg *val)
1892 {
1893         int r = 0;
1894         long int i;
1895
1896         switch (id) {
1897         case KVM_REG_PPC_DEBUG_INST:
1898                 *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
1899                 break;
1900         case KVM_REG_PPC_HIOR:
1901                 *val = get_reg_val(id, 0);
1902                 break;
1903         case KVM_REG_PPC_DABR:
1904                 *val = get_reg_val(id, vcpu->arch.dabr);
1905                 break;
1906         case KVM_REG_PPC_DABRX:
1907                 *val = get_reg_val(id, vcpu->arch.dabrx);
1908                 break;
1909         case KVM_REG_PPC_DSCR:
1910                 *val = get_reg_val(id, vcpu->arch.dscr);
1911                 break;
1912         case KVM_REG_PPC_PURR:
1913                 *val = get_reg_val(id, vcpu->arch.purr);
1914                 break;
1915         case KVM_REG_PPC_SPURR:
1916                 *val = get_reg_val(id, vcpu->arch.spurr);
1917                 break;
1918         case KVM_REG_PPC_AMR:
1919                 *val = get_reg_val(id, vcpu->arch.amr);
1920                 break;
1921         case KVM_REG_PPC_UAMOR:
1922                 *val = get_reg_val(id, vcpu->arch.uamor);
1923                 break;
1924         case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
1925                 i = id - KVM_REG_PPC_MMCR0;
1926                 *val = get_reg_val(id, vcpu->arch.mmcr[i]);
1927                 break;
1928         case KVM_REG_PPC_MMCR2:
1929                 *val = get_reg_val(id, vcpu->arch.mmcr[2]);
1930                 break;
1931         case KVM_REG_PPC_MMCRA:
1932                 *val = get_reg_val(id, vcpu->arch.mmcra);
1933                 break;
1934         case KVM_REG_PPC_MMCRS:
1935                 *val = get_reg_val(id, vcpu->arch.mmcrs);
1936                 break;
1937         case KVM_REG_PPC_MMCR3:
1938                 *val = get_reg_val(id, vcpu->arch.mmcr[3]);
1939                 break;
1940         case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
1941                 i = id - KVM_REG_PPC_PMC1;
1942                 *val = get_reg_val(id, vcpu->arch.pmc[i]);
1943                 break;
1944         case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
1945                 i = id - KVM_REG_PPC_SPMC1;
1946                 *val = get_reg_val(id, vcpu->arch.spmc[i]);
1947                 break;
1948         case KVM_REG_PPC_SIAR:
1949                 *val = get_reg_val(id, vcpu->arch.siar);
1950                 break;
1951         case KVM_REG_PPC_SDAR:
1952                 *val = get_reg_val(id, vcpu->arch.sdar);
1953                 break;
1954         case KVM_REG_PPC_SIER:
1955                 *val = get_reg_val(id, vcpu->arch.sier[0]);
1956                 break;
1957         case KVM_REG_PPC_SIER2:
1958                 *val = get_reg_val(id, vcpu->arch.sier[1]);
1959                 break;
1960         case KVM_REG_PPC_SIER3:
1961                 *val = get_reg_val(id, vcpu->arch.sier[2]);
1962                 break;
1963         case KVM_REG_PPC_IAMR:
1964                 *val = get_reg_val(id, vcpu->arch.iamr);
1965                 break;
1966         case KVM_REG_PPC_PSPB:
1967                 *val = get_reg_val(id, vcpu->arch.pspb);
1968                 break;
1969         case KVM_REG_PPC_DPDES:
1970                 /*
1971                  * On POWER9, where we are emulating msgsndp etc.,
1972                  * we return 1 bit for each vcpu, which can come from
1973                  * either vcore->dpdes or doorbell_request.
1974                  * On POWER8, doorbell_request is 0.
1975                  */
1976                 *val = get_reg_val(id, vcpu->arch.vcore->dpdes |
1977                                    vcpu->arch.doorbell_request);
1978                 break;
1979         case KVM_REG_PPC_VTB:
1980                 *val = get_reg_val(id, vcpu->arch.vcore->vtb);
1981                 break;
1982         case KVM_REG_PPC_DAWR:
1983                 *val = get_reg_val(id, vcpu->arch.dawr0);
1984                 break;
1985         case KVM_REG_PPC_DAWRX:
1986                 *val = get_reg_val(id, vcpu->arch.dawrx0);
1987                 break;
1988         case KVM_REG_PPC_DAWR1:
1989                 *val = get_reg_val(id, vcpu->arch.dawr1);
1990                 break;
1991         case KVM_REG_PPC_DAWRX1:
1992                 *val = get_reg_val(id, vcpu->arch.dawrx1);
1993                 break;
1994         case KVM_REG_PPC_CIABR:
1995                 *val = get_reg_val(id, vcpu->arch.ciabr);
1996                 break;
1997         case KVM_REG_PPC_CSIGR:
1998                 *val = get_reg_val(id, vcpu->arch.csigr);
1999                 break;
2000         case KVM_REG_PPC_TACR:
2001                 *val = get_reg_val(id, vcpu->arch.tacr);
2002                 break;
2003         case KVM_REG_PPC_TCSCR:
2004                 *val = get_reg_val(id, vcpu->arch.tcscr);
2005                 break;
2006         case KVM_REG_PPC_PID:
2007                 *val = get_reg_val(id, vcpu->arch.pid);
2008                 break;
2009         case KVM_REG_PPC_ACOP:
2010                 *val = get_reg_val(id, vcpu->arch.acop);
2011                 break;
2012         case KVM_REG_PPC_WORT:
2013                 *val = get_reg_val(id, vcpu->arch.wort);
2014                 break;
2015         case KVM_REG_PPC_TIDR:
2016                 *val = get_reg_val(id, vcpu->arch.tid);
2017                 break;
2018         case KVM_REG_PPC_PSSCR:
2019                 *val = get_reg_val(id, vcpu->arch.psscr);
2020                 break;
2021         case KVM_REG_PPC_VPA_ADDR:
2022                 spin_lock(&vcpu->arch.vpa_update_lock);
2023                 *val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
2024                 spin_unlock(&vcpu->arch.vpa_update_lock);
2025                 break;
2026         case KVM_REG_PPC_VPA_SLB:
2027                 spin_lock(&vcpu->arch.vpa_update_lock);
2028                 val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
2029                 val->vpaval.length = vcpu->arch.slb_shadow.len;
2030                 spin_unlock(&vcpu->arch.vpa_update_lock);
2031                 break;
2032         case KVM_REG_PPC_VPA_DTL:
2033                 spin_lock(&vcpu->arch.vpa_update_lock);
2034                 val->vpaval.addr = vcpu->arch.dtl.next_gpa;
2035                 val->vpaval.length = vcpu->arch.dtl.len;
2036                 spin_unlock(&vcpu->arch.vpa_update_lock);
2037                 break;
2038         case KVM_REG_PPC_TB_OFFSET:
2039                 *val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
2040                 break;
2041         case KVM_REG_PPC_LPCR:
2042         case KVM_REG_PPC_LPCR_64:
2043                 *val = get_reg_val(id, vcpu->arch.vcore->lpcr);
2044                 break;
2045         case KVM_REG_PPC_PPR:
2046                 *val = get_reg_val(id, vcpu->arch.ppr);
2047                 break;
2048 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2049         case KVM_REG_PPC_TFHAR:
2050                 *val = get_reg_val(id, vcpu->arch.tfhar);
2051                 break;
2052         case KVM_REG_PPC_TFIAR:
2053                 *val = get_reg_val(id, vcpu->arch.tfiar);
2054                 break;
2055         case KVM_REG_PPC_TEXASR:
2056                 *val = get_reg_val(id, vcpu->arch.texasr);
2057                 break;
2058         case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2059                 i = id - KVM_REG_PPC_TM_GPR0;
2060                 *val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
2061                 break;
2062         case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2063         {
2064                 int j;
2065                 i = id - KVM_REG_PPC_TM_VSR0;
2066                 if (i < 32)
2067                         for (j = 0; j < TS_FPRWIDTH; j++)
2068                                 val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
2069                 else {
2070                         if (cpu_has_feature(CPU_FTR_ALTIVEC))
2071                                 val->vval = vcpu->arch.vr_tm.vr[i-32];
2072                         else
2073                                 r = -ENXIO;
2074                 }
2075                 break;
2076         }
2077         case KVM_REG_PPC_TM_CR:
2078                 *val = get_reg_val(id, vcpu->arch.cr_tm);
2079                 break;
2080         case KVM_REG_PPC_TM_XER:
2081                 *val = get_reg_val(id, vcpu->arch.xer_tm);
2082                 break;
2083         case KVM_REG_PPC_TM_LR:
2084                 *val = get_reg_val(id, vcpu->arch.lr_tm);
2085                 break;
2086         case KVM_REG_PPC_TM_CTR:
2087                 *val = get_reg_val(id, vcpu->arch.ctr_tm);
2088                 break;
2089         case KVM_REG_PPC_TM_FPSCR:
2090                 *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
2091                 break;
2092         case KVM_REG_PPC_TM_AMR:
2093                 *val = get_reg_val(id, vcpu->arch.amr_tm);
2094                 break;
2095         case KVM_REG_PPC_TM_PPR:
2096                 *val = get_reg_val(id, vcpu->arch.ppr_tm);
2097                 break;
2098         case KVM_REG_PPC_TM_VRSAVE:
2099                 *val = get_reg_val(id, vcpu->arch.vrsave_tm);
2100                 break;
2101         case KVM_REG_PPC_TM_VSCR:
2102                 if (cpu_has_feature(CPU_FTR_ALTIVEC))
2103                         *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
2104                 else
2105                         r = -ENXIO;
2106                 break;
2107         case KVM_REG_PPC_TM_DSCR:
2108                 *val = get_reg_val(id, vcpu->arch.dscr_tm);
2109                 break;
2110         case KVM_REG_PPC_TM_TAR:
2111                 *val = get_reg_val(id, vcpu->arch.tar_tm);
2112                 break;
2113 #endif
2114         case KVM_REG_PPC_ARCH_COMPAT:
2115                 *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
2116                 break;
2117         case KVM_REG_PPC_DEC_EXPIRY:
2118                 *val = get_reg_val(id, vcpu->arch.dec_expires +
2119                                    vcpu->arch.vcore->tb_offset);
2120                 break;
2121         case KVM_REG_PPC_ONLINE:
2122                 *val = get_reg_val(id, vcpu->arch.online);
2123                 break;
2124         case KVM_REG_PPC_PTCR:
2125                 *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
2126                 break;
2127         default:
2128                 r = -EINVAL;
2129                 break;
2130         }
2131
2132         return r;
2133 }
2134
2135 static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2136                                  union kvmppc_one_reg *val)
2137 {
2138         int r = 0;
2139         long int i;
2140         unsigned long addr, len;
2141
2142         switch (id) {
2143         case KVM_REG_PPC_HIOR:
2144                 /* Only allow this to be set to zero */
2145                 if (set_reg_val(id, *val))
2146                         r = -EINVAL;
2147                 break;
2148         case KVM_REG_PPC_DABR:
2149                 vcpu->arch.dabr = set_reg_val(id, *val);
2150                 break;
2151         case KVM_REG_PPC_DABRX:
2152                 vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
2153                 break;
2154         case KVM_REG_PPC_DSCR:
2155                 vcpu->arch.dscr = set_reg_val(id, *val);
2156                 break;
2157         case KVM_REG_PPC_PURR:
2158                 vcpu->arch.purr = set_reg_val(id, *val);
2159                 break;
2160         case KVM_REG_PPC_SPURR:
2161                 vcpu->arch.spurr = set_reg_val(id, *val);
2162                 break;
2163         case KVM_REG_PPC_AMR:
2164                 vcpu->arch.amr = set_reg_val(id, *val);
2165                 break;
2166         case KVM_REG_PPC_UAMOR:
2167                 vcpu->arch.uamor = set_reg_val(id, *val);
2168                 break;
2169         case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2170                 i = id - KVM_REG_PPC_MMCR0;
2171                 vcpu->arch.mmcr[i] = set_reg_val(id, *val);
2172                 break;
2173         case KVM_REG_PPC_MMCR2:
2174                 vcpu->arch.mmcr[2] = set_reg_val(id, *val);
2175                 break;
2176         case KVM_REG_PPC_MMCRA:
2177                 vcpu->arch.mmcra = set_reg_val(id, *val);
2178                 break;
2179         case KVM_REG_PPC_MMCRS:
2180                 vcpu->arch.mmcrs = set_reg_val(id, *val);
2181                 break;
2182         case KVM_REG_PPC_MMCR3:
2183                 vcpu->arch.mmcr[3] = set_reg_val(id, *val);
2184                 break;
2185         case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2186                 i = id - KVM_REG_PPC_PMC1;
2187                 vcpu->arch.pmc[i] = set_reg_val(id, *val);
2188                 break;
2189         case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2190                 i = id - KVM_REG_PPC_SPMC1;
2191                 vcpu->arch.spmc[i] = set_reg_val(id, *val);
2192                 break;
2193         case KVM_REG_PPC_SIAR:
2194                 vcpu->arch.siar = set_reg_val(id, *val);
2195                 break;
2196         case KVM_REG_PPC_SDAR:
2197                 vcpu->arch.sdar = set_reg_val(id, *val);
2198                 break;
2199         case KVM_REG_PPC_SIER:
2200                 vcpu->arch.sier[0] = set_reg_val(id, *val);
2201                 break;
2202         case KVM_REG_PPC_SIER2:
2203                 vcpu->arch.sier[1] = set_reg_val(id, *val);
2204                 break;
2205         case KVM_REG_PPC_SIER3:
2206                 vcpu->arch.sier[2] = set_reg_val(id, *val);
2207                 break;
2208         case KVM_REG_PPC_IAMR:
2209                 vcpu->arch.iamr = set_reg_val(id, *val);
2210                 break;
2211         case KVM_REG_PPC_PSPB:
2212                 vcpu->arch.pspb = set_reg_val(id, *val);
2213                 break;
2214         case KVM_REG_PPC_DPDES:
2215                 vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
2216                 break;
2217         case KVM_REG_PPC_VTB:
2218                 vcpu->arch.vcore->vtb = set_reg_val(id, *val);
2219                 break;
2220         case KVM_REG_PPC_DAWR:
2221                 vcpu->arch.dawr0 = set_reg_val(id, *val);
2222                 break;
2223         case KVM_REG_PPC_DAWRX:
2224                 vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP;
2225                 break;
2226         case KVM_REG_PPC_DAWR1:
2227                 vcpu->arch.dawr1 = set_reg_val(id, *val);
2228                 break;
2229         case KVM_REG_PPC_DAWRX1:
2230                 vcpu->arch.dawrx1 = set_reg_val(id, *val) & ~DAWRX_HYP;
2231                 break;
2232         case KVM_REG_PPC_CIABR:
2233                 vcpu->arch.ciabr = set_reg_val(id, *val);
2234                 /* Don't allow setting breakpoints in hypervisor code */
2235                 if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
2236                         vcpu->arch.ciabr &= ~CIABR_PRIV;        /* disable */
2237                 break;
2238         case KVM_REG_PPC_CSIGR:
2239                 vcpu->arch.csigr = set_reg_val(id, *val);
2240                 break;
2241         case KVM_REG_PPC_TACR:
2242                 vcpu->arch.tacr = set_reg_val(id, *val);
2243                 break;
2244         case KVM_REG_PPC_TCSCR:
2245                 vcpu->arch.tcscr = set_reg_val(id, *val);
2246                 break;
2247         case KVM_REG_PPC_PID:
2248                 vcpu->arch.pid = set_reg_val(id, *val);
2249                 break;
2250         case KVM_REG_PPC_ACOP:
2251                 vcpu->arch.acop = set_reg_val(id, *val);
2252                 break;
2253         case KVM_REG_PPC_WORT:
2254                 vcpu->arch.wort = set_reg_val(id, *val);
2255                 break;
2256         case KVM_REG_PPC_TIDR:
2257                 vcpu->arch.tid = set_reg_val(id, *val);
2258                 break;
2259         case KVM_REG_PPC_PSSCR:
2260                 vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
2261                 break;
2262         case KVM_REG_PPC_VPA_ADDR:
2263                 addr = set_reg_val(id, *val);
2264                 r = -EINVAL;
2265                 if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
2266                               vcpu->arch.dtl.next_gpa))
2267                         break;
2268                 r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
2269                 break;
2270         case KVM_REG_PPC_VPA_SLB:
2271                 addr = val->vpaval.addr;
2272                 len = val->vpaval.length;
2273                 r = -EINVAL;
2274                 if (addr && !vcpu->arch.vpa.next_gpa)
2275                         break;
2276                 r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
2277                 break;
2278         case KVM_REG_PPC_VPA_DTL:
2279                 addr = val->vpaval.addr;
2280                 len = val->vpaval.length;
2281                 r = -EINVAL;
2282                 if (addr && (len < sizeof(struct dtl_entry) ||
2283                              !vcpu->arch.vpa.next_gpa))
2284                         break;
2285                 len -= len % sizeof(struct dtl_entry);
2286                 r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
2287                 break;
2288         case KVM_REG_PPC_TB_OFFSET:
2289                 /* round up to multiple of 2^24 */
2290                 vcpu->arch.vcore->tb_offset =
2291                         ALIGN(set_reg_val(id, *val), 1UL << 24);
2292                 break;
2293         case KVM_REG_PPC_LPCR:
2294                 kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
2295                 break;
2296         case KVM_REG_PPC_LPCR_64:
2297                 kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
2298                 break;
2299         case KVM_REG_PPC_PPR:
2300                 vcpu->arch.ppr = set_reg_val(id, *val);
2301                 break;
2302 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2303         case KVM_REG_PPC_TFHAR:
2304                 vcpu->arch.tfhar = set_reg_val(id, *val);
2305                 break;
2306         case KVM_REG_PPC_TFIAR:
2307                 vcpu->arch.tfiar = set_reg_val(id, *val);
2308                 break;
2309         case KVM_REG_PPC_TEXASR:
2310                 vcpu->arch.texasr = set_reg_val(id, *val);
2311                 break;
2312         case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2313                 i = id - KVM_REG_PPC_TM_GPR0;
2314                 vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
2315                 break;
2316         case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2317         {
2318                 int j;
2319                 i = id - KVM_REG_PPC_TM_VSR0;
2320                 if (i < 32)
2321                         for (j = 0; j < TS_FPRWIDTH; j++)
2322                                 vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
2323                 else
2324                         if (cpu_has_feature(CPU_FTR_ALTIVEC))
2325                                 vcpu->arch.vr_tm.vr[i-32] = val->vval;
2326                         else
2327                                 r = -ENXIO;
2328                 break;
2329         }
2330         case KVM_REG_PPC_TM_CR:
2331                 vcpu->arch.cr_tm = set_reg_val(id, *val);
2332                 break;
2333         case KVM_REG_PPC_TM_XER:
2334                 vcpu->arch.xer_tm = set_reg_val(id, *val);
2335                 break;
2336         case KVM_REG_PPC_TM_LR:
2337                 vcpu->arch.lr_tm = set_reg_val(id, *val);
2338                 break;
2339         case KVM_REG_PPC_TM_CTR:
2340                 vcpu->arch.ctr_tm = set_reg_val(id, *val);
2341                 break;
2342         case KVM_REG_PPC_TM_FPSCR:
2343                 vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
2344                 break;
2345         case KVM_REG_PPC_TM_AMR:
2346                 vcpu->arch.amr_tm = set_reg_val(id, *val);
2347                 break;
2348         case KVM_REG_PPC_TM_PPR:
2349                 vcpu->arch.ppr_tm = set_reg_val(id, *val);
2350                 break;
2351         case KVM_REG_PPC_TM_VRSAVE:
2352                 vcpu->arch.vrsave_tm = set_reg_val(id, *val);
2353                 break;
2354         case KVM_REG_PPC_TM_VSCR:
2355                 if (cpu_has_feature(CPU_FTR_ALTIVEC))
2356                         vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val);
2357                 else
2358                         r = -ENXIO;
2359                 break;
2360         case KVM_REG_PPC_TM_DSCR:
2361                 vcpu->arch.dscr_tm = set_reg_val(id, *val);
2362                 break;
2363         case KVM_REG_PPC_TM_TAR:
2364                 vcpu->arch.tar_tm = set_reg_val(id, *val);
2365                 break;
2366 #endif
2367         case KVM_REG_PPC_ARCH_COMPAT:
2368                 r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
2369                 break;
2370         case KVM_REG_PPC_DEC_EXPIRY:
2371                 vcpu->arch.dec_expires = set_reg_val(id, *val) -
2372                         vcpu->arch.vcore->tb_offset;
2373                 break;
2374         case KVM_REG_PPC_ONLINE:
2375                 i = set_reg_val(id, *val);
2376                 if (i && !vcpu->arch.online)
2377                         atomic_inc(&vcpu->arch.vcore->online_count);
2378                 else if (!i && vcpu->arch.online)
2379                         atomic_dec(&vcpu->arch.vcore->online_count);
2380                 vcpu->arch.online = i;
2381                 break;
2382         case KVM_REG_PPC_PTCR:
2383                 vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
2384                 break;
2385         default:
2386                 r = -EINVAL;
2387                 break;
2388         }
2389
2390         return r;
2391 }
2392
2393 /*
2394  * On POWER9, threads are independent and can be in different partitions.
2395  * Therefore we consider each thread to be a subcore.
2396  * There is a restriction that all threads have to be in the same
2397  * MMU mode (radix or HPT), unfortunately, but since we only support
2398  * HPT guests on a HPT host so far, that isn't an impediment yet.
2399  */
2400 static int threads_per_vcore(struct kvm *kvm)
2401 {
2402         if (cpu_has_feature(CPU_FTR_ARCH_300))
2403                 return 1;
2404         return threads_per_subcore;
2405 }
2406
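     /*
      * Allocate and initialise a virtual core.  vcpus are attached to a
      * vcore in kvmppc_core_vcpu_create_hv() below.
      */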
2407 static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
2408 {
2409         struct kvmppc_vcore *vcore;
2410
2411         vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
2412
2413         if (vcore == NULL)
2414                 return NULL;
2415
2416         spin_lock_init(&vcore->lock);
2417         spin_lock_init(&vcore->stoltb_lock);
2418         rcuwait_init(&vcore->wait);
2419         vcore->preempt_tb = TB_NIL;
2420         vcore->lpcr = kvm->arch.lpcr;
2421         vcore->first_vcpuid = id;
2422         vcore->kvm = kvm;
2423         INIT_LIST_HEAD(&vcore->preempt_list);
2424
2425         return vcore;
2426 }
2427
2428 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
2429 static struct debugfs_timings_element {
2430         const char *name;
2431         size_t offset;
2432 } timings[] = {
2433         {"rm_entry",    offsetof(struct kvm_vcpu, arch.rm_entry)},
2434         {"rm_intr",     offsetof(struct kvm_vcpu, arch.rm_intr)},
2435         {"rm_exit",     offsetof(struct kvm_vcpu, arch.rm_exit)},
2436         {"guest",       offsetof(struct kvm_vcpu, arch.guest_time)},
2437         {"cede",        offsetof(struct kvm_vcpu, arch.cede_time)},
2438 };
2439
2440 #define N_TIMINGS       (ARRAY_SIZE(timings))
2441
2442 struct debugfs_timings_state {
2443         struct kvm_vcpu *vcpu;
2444         unsigned int    buflen;
2445         char            buf[N_TIMINGS * 100];
2446 };
2447
2448 static int debugfs_timings_open(struct inode *inode, struct file *file)
2449 {
2450         struct kvm_vcpu *vcpu = inode->i_private;
2451         struct debugfs_timings_state *p;
2452
2453         p = kzalloc(sizeof(*p), GFP_KERNEL);
2454         if (!p)
2455                 return -ENOMEM;
2456
2457         kvm_get_kvm(vcpu->kvm);
2458         p->vcpu = vcpu;
2459         file->private_data = p;
2460
2461         return nonseekable_open(inode, file);
2462 }
2463
2464 static int debugfs_timings_release(struct inode *inode, struct file *file)
2465 {
2466         struct debugfs_timings_state *p = file->private_data;
2467
2468         kvm_put_kvm(p->vcpu->kvm);
2469         kfree(p);
2470         return 0;
2471 }
2472
2473 static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
2474                                     size_t len, loff_t *ppos)
2475 {
2476         struct debugfs_timings_state *p = file->private_data;
2477         struct kvm_vcpu *vcpu = p->vcpu;
2478         char *s, *buf_end;
2479         struct kvmhv_tb_accumulator tb;
2480         u64 count;
2481         loff_t pos;
2482         ssize_t n;
2483         int i, loops;
2484         bool ok;
2485
2486         if (!p->buflen) {
2487                 s = p->buf;
2488                 buf_end = s + sizeof(p->buf);
2489                 for (i = 0; i < N_TIMINGS; ++i) {
2490                         struct kvmhv_tb_accumulator *acc;
2491
2492                         acc = (struct kvmhv_tb_accumulator *)
2493                                 ((unsigned long)vcpu + timings[i].offset);
2494                         ok = false;
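                             /*
                              * Lockless snapshot: the vcpu side increments
                              * seqcount around updates (odd while an update
                              * is in progress), so retry until we read a
                              * stable, even count before and after copying.
                              */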
2495                         for (loops = 0; loops < 1000; ++loops) {
2496                                 count = acc->seqcount;
2497                                 if (!(count & 1)) {
2498                                         smp_rmb();
2499                                         tb = *acc;
2500                                         smp_rmb();
2501                                         if (count == acc->seqcount) {
2502                                                 ok = true;
2503                                                 break;
2504                                         }
2505                                 }
2506                                 udelay(1);
2507                         }
2508                         if (!ok)
2509                                 snprintf(s, buf_end - s, "%s: stuck\n",
2510                                         timings[i].name);
2511                         else
2512                                 snprintf(s, buf_end - s,
2513                                         "%s: %llu %llu %llu %llu\n",
2514                                         timings[i].name, count / 2,
2515                                         tb_to_ns(tb.tb_total),
2516                                         tb_to_ns(tb.tb_min),
2517                                         tb_to_ns(tb.tb_max));
2518                         s += strlen(s);
2519                 }
2520                 p->buflen = s - p->buf;
2521         }
2522
2523         pos = *ppos;
2524         if (pos >= p->buflen)
2525                 return 0;
2526         if (len > p->buflen - pos)
2527                 len = p->buflen - pos;
2528         n = copy_to_user(buf, p->buf + pos, len);
2529         if (n) {
2530                 if (n == len)
2531                         return -EFAULT;
2532                 len -= n;
2533         }
2534         *ppos = pos + len;
2535         return len;
2536 }
2537
2538 static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
2539                                      size_t len, loff_t *ppos)
2540 {
2541         return -EACCES;
2542 }
2543
2544 static const struct file_operations debugfs_timings_ops = {
2545         .owner   = THIS_MODULE,
2546         .open    = debugfs_timings_open,
2547         .release = debugfs_timings_release,
2548         .read    = debugfs_timings_read,
2549         .write   = debugfs_timings_write,
2550         .llseek  = generic_file_llseek,
2551 };
2552
2553 /* Create a debugfs directory for the vcpu */
2554 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
2555 {
2556         char buf[16];
2557         struct kvm *kvm = vcpu->kvm;
2558
2559         snprintf(buf, sizeof(buf), "vcpu%u", id);
2560         vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
2561         debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, vcpu,
2562                             &debugfs_timings_ops);
2563 }
2564
2565 #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2566 static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
2567 {
2568 }
2569 #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2570
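     /*
      * vcpu creation: set up default register state, find or create the
      * virtual core this vcpu belongs to, and create its debugfs entries.
      */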
2571 static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
2572 {
2573         int err;
2574         int core;
2575         struct kvmppc_vcore *vcore;
2576         struct kvm *kvm;
2577         unsigned int id;
2578
2579         kvm = vcpu->kvm;
2580         id = vcpu->vcpu_id;
2581
2582         vcpu->arch.shared = &vcpu->arch.shregs;
2583 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
2584         /*
2585          * The shared struct is never shared on HV,
2586          * so we can always use host endianness
2587          */
2588 #ifdef __BIG_ENDIAN__
2589         vcpu->arch.shared_big_endian = true;
2590 #else
2591         vcpu->arch.shared_big_endian = false;
2592 #endif
2593 #endif
2594         vcpu->arch.mmcr[0] = MMCR0_FC;
2595         vcpu->arch.ctrl = CTRL_RUNLATCH;
2596         /* default to host PVR, since we can't spoof it */
2597         kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
2598         spin_lock_init(&vcpu->arch.vpa_update_lock);
2599         spin_lock_init(&vcpu->arch.tbacct_lock);
2600         vcpu->arch.busy_preempt = TB_NIL;
2601         vcpu->arch.intr_msr = MSR_SF | MSR_ME;
2602
2603         /*
2604          * Set the default HFSCR for the guest from the host value.
2605          * This value is only used on POWER9.
2606          * On POWER9, we want to virtualize the doorbell facility, so we
2607          * don't set the HFSCR_MSGP bit, and that causes those instructions
2608          * to trap and then we emulate them.
2609          */
2610         vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
2611                 HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP | HFSCR_PREFIX;
2612         if (cpu_has_feature(CPU_FTR_HVMODE)) {
2613                 vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
2614                 if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
2615                         vcpu->arch.hfscr |= HFSCR_TM;
2616         }
2617         if (cpu_has_feature(CPU_FTR_TM_COMP))
2618                 vcpu->arch.hfscr |= HFSCR_TM;
2619
2620         kvmppc_mmu_book3s_hv_init(vcpu);
2621
2622         vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
2623
2624         init_waitqueue_head(&vcpu->arch.cpu_run);
2625
2626         mutex_lock(&kvm->lock);
2627         vcore = NULL;
2628         err = -EINVAL;
2629         if (cpu_has_feature(CPU_FTR_ARCH_300)) {
2630                 if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
2631                         pr_devel("KVM: VCPU ID too high\n");
2632                         core = KVM_MAX_VCORES;
2633                 } else {
2634                         BUG_ON(kvm->arch.smt_mode != 1);
2635                         core = kvmppc_pack_vcpu_id(kvm, id);
2636                 }
2637         } else {
2638                 core = id / kvm->arch.smt_mode;
2639         }
2640         if (core < KVM_MAX_VCORES) {
2641                 vcore = kvm->arch.vcores[core];
2642                 if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
2643                         pr_devel("KVM: collision on id %u\n", id);
2644                         vcore = NULL;
2645                 } else if (!vcore) {
2646                         /*
2647                          * Take mmu_setup_lock for mutual exclusion
2648                          * with kvmppc_update_lpcr().
2649                          */
2650                         err = -ENOMEM;
2651                         vcore = kvmppc_vcore_create(kvm,
2652                                         id & ~(kvm->arch.smt_mode - 1));
2653                         mutex_lock(&kvm->arch.mmu_setup_lock);
2654                         kvm->arch.vcores[core] = vcore;
2655                         kvm->arch.online_vcores++;
2656                         mutex_unlock(&kvm->arch.mmu_setup_lock);
2657                 }
2658         }
2659         mutex_unlock(&kvm->lock);
2660
2661         if (!vcore)
2662                 return err;
2663
2664         spin_lock(&vcore->lock);
2665         ++vcore->num_threads;
2666         spin_unlock(&vcore->lock);
2667         vcpu->arch.vcore = vcore;
2668         vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
2669         vcpu->arch.thread_cpu = -1;
2670         vcpu->arch.prev_cpu = -1;
2671
2672         vcpu->arch.cpu_type = KVM_CPU_3S_64;
2673         kvmppc_sanity_check(vcpu);
2674
2675         debugfs_vcpu_init(vcpu, id);
2676
2677         return 0;
2678 }
2679
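     /*
      * Set the (possibly emulated) SMT mode for the VM.  Only permitted
      * while the VM has no online vcores.
      */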
2680 static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
2681                               unsigned long flags)
2682 {
2683         int err;
2684         int esmt = 0;
2685
2686         if (flags)
2687                 return -EINVAL;
2688         if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
2689                 return -EINVAL;
2690         if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
2691                 /*
2692                  * On POWER8 (or POWER7), the threading mode is "strict",
2693                  * so we pack smt_mode vcpus per vcore.
2694                  */
2695                 if (smt_mode > threads_per_subcore)
2696                         return -EINVAL;
2697         } else {
2698                 /*
2699                  * On POWER9, the threading mode is "loose",
2700                  * so each vcpu gets its own vcore.
2701                  */
2702                 esmt = smt_mode;
2703                 smt_mode = 1;
2704         }
2705         mutex_lock(&kvm->lock);
2706         err = -EBUSY;
2707         if (!kvm->arch.online_vcores) {
2708                 kvm->arch.smt_mode = smt_mode;
2709                 kvm->arch.emul_smt_mode = esmt;
2710                 err = 0;
2711         }
2712         mutex_unlock(&kvm->lock);
2713
2714         return err;
2715 }
2716
2717 static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
2718 {
2719         if (vpa->pinned_addr)
2720                 kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
2721                                         vpa->dirty);
2722 }
2723
2724 static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
2725 {
2726         spin_lock(&vcpu->arch.vpa_update_lock);
2727         unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
2728         unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
2729         unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
2730         spin_unlock(&vcpu->arch.vpa_update_lock);
2731 }
2732
2733 static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
2734 {
2735         /* Indicate we want to get back into the guest */
2736         return 1;
2737 }
2738
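/*
 * Arm the hrtimer that emulates the guest decrementer for a ceded vcpu,
 * or queue a decrementer exception straight away if it has already expired.
 */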
2739 static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
2740 {
2741         unsigned long dec_nsec, now;
2742
2743         now = get_tb();
2744         if (now > vcpu->arch.dec_expires) {
2745                 /* decrementer has already gone negative */
2746                 kvmppc_core_queue_dec(vcpu);
2747                 kvmppc_core_prepare_to_enter(vcpu);
2748                 return;
2749         }
2750         dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now);
2751         hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
2752         vcpu->arch.timer_running = 1;
2753 }
2754
2755 extern int __kvmppc_vcore_entry(void);
2756
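/*
 * Take a vcpu out of the vcore's set of runnable threads and account the
 * time it spent runnable as busy/stolen time.
 */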
2757 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
2758                                    struct kvm_vcpu *vcpu)
2759 {
2760         u64 now;
2761
2762         if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
2763                 return;
2764         spin_lock_irq(&vcpu->arch.tbacct_lock);
2765         now = mftb();
2766         vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
2767                 vcpu->arch.stolen_logged;
2768         vcpu->arch.busy_preempt = now;
2769         vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
2770         spin_unlock_irq(&vcpu->arch.tbacct_lock);
2771         --vc->n_runnable;
2772         WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
2773 }
2774
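/*
 * Claim a secondary hardware thread for KVM's use: ask it to stay out of
 * the host kernel (hwthread_req) and wait for it to get back to nap mode.
 * Returns -EBUSY if it doesn't reach nap in time.
 */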
2775 static int kvmppc_grab_hwthread(int cpu)
2776 {
2777         struct paca_struct *tpaca;
2778         long timeout = 10000;
2779
2780         tpaca = paca_ptrs[cpu];
2781
2782         /* Ensure the thread won't go into the kernel if it wakes */
2783         tpaca->kvm_hstate.kvm_vcpu = NULL;
2784         tpaca->kvm_hstate.kvm_vcore = NULL;
2785         tpaca->kvm_hstate.napping = 0;
2786         smp_wmb();
2787         tpaca->kvm_hstate.hwthread_req = 1;
2788
2789         /*
2790          * If the thread is already executing in the kernel (e.g. handling
2791          * a stray interrupt), wait for it to get back to nap mode.
2792          * The smp_mb() is to ensure that our setting of hwthread_req
2793          * is visible before we look at hwthread_state, so if this
2794          * races with the code at system_reset_pSeries and the thread
2795          * misses our setting of hwthread_req, we are sure to see its
2796          * setting of hwthread_state, and vice versa.
2797          */
2798         smp_mb();
2799         while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
2800                 if (--timeout <= 0) {
2801                         pr_err("KVM: couldn't grab cpu %d\n", cpu);
2802                         return -EBUSY;
2803                 }
2804                 udelay(1);
2805         }
2806         return 0;
2807 }
2808
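/* Release a hardware thread claimed by kvmppc_grab_hwthread(). */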
2809 static void kvmppc_release_hwthread(int cpu)
2810 {
2811         struct paca_struct *tpaca;
2812
2813         tpaca = paca_ptrs[cpu];
2814         tpaca->kvm_hstate.hwthread_req = 0;
2815         tpaca->kvm_hstate.kvm_vcpu = NULL;
2816         tpaca->kvm_hstate.kvm_vcore = NULL;
2817         tpaca->kvm_hstate.kvm_split_mode = NULL;
2818 }
2819
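/*
 * Request a TLB flush on the core that @cpu belongs to: set that core's
 * bit in need_tlb_flush and send a no-op IPI to any of its threads that
 * are currently in the guest, so the flush is noticed on the next entry.
 */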
2820 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
2821 {
2822         struct kvm_nested_guest *nested = vcpu->arch.nested;
2823         cpumask_t *cpu_in_guest;
2824         int i;
2825
2826         cpu = cpu_first_thread_sibling(cpu);
2827         if (nested) {
2828                 cpumask_set_cpu(cpu, &nested->need_tlb_flush);
2829                 cpu_in_guest = &nested->cpu_in_guest;
2830         } else {
2831                 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
2832                 cpu_in_guest = &kvm->arch.cpu_in_guest;
2833         }
2834         /*
2835          * Make sure setting of bit in need_tlb_flush precedes
2836          * testing of cpu_in_guest bits.  The matching barrier on
2837          * the other side is the first smp_mb() in kvmppc_run_core().
2838          */
2839         smp_mb();
2840         for (i = 0; i < threads_per_core; ++i)
2841                 if (cpumask_test_cpu(cpu + i, cpu_in_guest))
2842                         smp_call_function_single(cpu + i, do_nothing, NULL, 1);
2843 }
2844
2845 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
2846 {
2847         struct kvm_nested_guest *nested = vcpu->arch.nested;
2848         struct kvm *kvm = vcpu->kvm;
2849         int prev_cpu;
2850
2851         if (!cpu_has_feature(CPU_FTR_HVMODE))
2852                 return;
2853
2854         if (nested)
2855                 prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
2856         else
2857                 prev_cpu = vcpu->arch.prev_cpu;
2858
2859         /*
2860          * With radix, the guest can do TLB invalidations itself,
2861          * and it could choose to use the local form (tlbiel) if
2862          * it is invalidating a translation that has only ever been
2863          * used on one vcpu.  However, that doesn't mean it has
2864          * only ever been used on one physical cpu, since vcpus
2865          * can move around between pcpus.  To cope with this, when
2866          * a vcpu moves from one pcpu to another, we need to tell
2867          * any vcpus running on the same core as this vcpu previously
2868          * ran to flush the TLB.  The TLB is shared between threads,
2869          * so we use a single bit in .need_tlb_flush for all 4 threads.
2870          */
2871         if (prev_cpu != pcpu) {
2872                 if (prev_cpu >= 0 &&
2873                     cpu_first_thread_sibling(prev_cpu) !=
2874                     cpu_first_thread_sibling(pcpu))
2875                         radix_flush_cpu(kvm, prev_cpu, vcpu);
2876                 if (nested)
2877                         nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
2878                 else
2879                         vcpu->arch.prev_cpu = pcpu;
2880         }
2881 }
2882
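/*
 * Point a hardware thread of the core at a vcpu (or at no vcpu, for an
 * idle primary thread) by setting up its PACA kvm_hstate fields, then
 * IPI it (unless it is the current cpu) so that it enters the guest.
 */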
2883 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
2884 {
2885         int cpu;
2886         struct paca_struct *tpaca;
2887         struct kvm *kvm = vc->kvm;
2888
2889         cpu = vc->pcpu;
2890         if (vcpu) {
2891                 if (vcpu->arch.timer_running) {
2892                         hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
2893                         vcpu->arch.timer_running = 0;
2894                 }
2895                 cpu += vcpu->arch.ptid;
2896                 vcpu->cpu = vc->pcpu;
2897                 vcpu->arch.thread_cpu = cpu;
2898                 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
2899         }
2900         tpaca = paca_ptrs[cpu];
2901         tpaca->kvm_hstate.kvm_vcpu = vcpu;
2902         tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
2903         tpaca->kvm_hstate.fake_suspend = 0;
2904         /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
2905         smp_wmb();
2906         tpaca->kvm_hstate.kvm_vcore = vc;
2907         if (cpu != smp_processor_id())
2908                 kvmppc_ipi_thread(cpu);
2909 }
2910
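/*
 * Wait for the secondary threads of the core to finish running the guest
 * and clear their kvm_hstate.kvm_vcore pointers, i.e. to go back to nap.
 */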
2911 static void kvmppc_wait_for_nap(int n_threads)
2912 {
2913         int cpu = smp_processor_id();
2914         int i, loops;
2915
2916         if (n_threads <= 1)
2917                 return;
2918         for (loops = 0; loops < 1000000; ++loops) {
2919                 /*
2920                  * Check if all threads are finished.
2921                  * We set the vcore pointer when starting a thread
2922                  * and the thread clears it when finished, so we look
2923                  * for any threads that still have a non-NULL vcore ptr.
2924                  */
2925                 for (i = 1; i < n_threads; ++i)
2926                         if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
2927                                 break;
2928                 if (i == n_threads) {
2929                         HMT_medium();
2930                         return;
2931                 }
2932                 HMT_low();
2933         }
2934         HMT_medium();
2935         for (i = 1; i < n_threads; ++i)
2936                 if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
2937                         pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
2938 }
2939
2940 /*
2941  * Check that we are on thread 0 and that any other threads in
2942  * this core are off-line.  Then grab the threads so they can't
2943  * enter the kernel.
2944  */
2945 static int on_primary_thread(void)
2946 {
2947         int cpu = smp_processor_id();
2948         int thr;
2949
2950         /* Are we on a primary subcore? */
2951         if (cpu_thread_in_subcore(cpu))
2952                 return 0;
2953
2954         thr = 0;
2955         while (++thr < threads_per_subcore)
2956                 if (cpu_online(cpu + thr))
2957                         return 0;
2958
2959         /* Grab all hw threads so they can't go into the kernel */
2960         for (thr = 1; thr < threads_per_subcore; ++thr) {
2961                 if (kvmppc_grab_hwthread(cpu + thr)) {
2962                         /* Couldn't grab one; let the others go */
2963                         do {
2964                                 kvmppc_release_hwthread(cpu + thr);
2965                         } while (--thr > 0);
2966                         return 0;
2967                 }
2968         }
2969         return 1;
2970 }
2971
2972 /*
2973  * A list of virtual cores for each physical CPU.
2974  * These are vcores that could run but their runner VCPU tasks are
2975  * (or may be) preempted.
2976  */
2977 struct preempted_vcore_list {
2978         struct list_head        list;
2979         spinlock_t              lock;
2980 };
2981
2982 static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
2983
2984 static void init_vcore_lists(void)
2985 {
2986         int cpu;
2987
2988         for_each_possible_cpu(cpu) {
2989                 struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
2990                 spin_lock_init(&lp->lock);
2991                 INIT_LIST_HEAD(&lp->list);
2992         }
2993 }
2994
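/*
 * Mark a vcore as preempted and start stolen-time accounting for it.
 * If it is not using all the threads of the core, also put it on this
 * CPU's preempted-vcore list so it can be piggybacked onto another run.
 */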
2995 static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
2996 {
2997         struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
2998
2999         vc->vcore_state = VCORE_PREEMPT;
3000         vc->pcpu = smp_processor_id();
3001         if (vc->num_threads < threads_per_vcore(vc->kvm)) {
3002                 spin_lock(&lp->lock);
3003                 list_add_tail(&vc->preempt_list, &lp->list);
3004                 spin_unlock(&lp->lock);
3005         }
3006
3007         /* Start accumulating stolen time */
3008         kvmppc_core_start_stolen(vc);
3009 }
3010
3011 static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
3012 {
3013         struct preempted_vcore_list *lp;
3014
3015         kvmppc_core_end_stolen(vc);
3016         if (!list_empty(&vc->preempt_list)) {
3017                 lp = &per_cpu(preempted_vcores, vc->pcpu);
3018                 spin_lock(&lp->lock);
3019                 list_del_init(&vc->preempt_list);
3020                 spin_unlock(&lp->lock);
3021         }
3022         vc->vcore_state = VCORE_INACTIVE;
3023 }
3024
3025 /*
3026  * This stores information about the virtual cores currently
3027  * assigned to a physical core.
3028  */
3029 struct core_info {
3030         int             n_subcores;
3031         int             max_subcore_threads;
3032         int             total_threads;
3033         int             subcore_threads[MAX_SUBCORES];
3034         struct kvmppc_vcore *vc[MAX_SUBCORES];
3035 };
3036
3037 /*
3038  * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
3039  * respectively in 2-way micro-threading (split-core) mode on POWER8.
3040  */
3041 static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
3042
3043 static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
3044 {
3045         memset(cip, 0, sizeof(*cip));
3046         cip->n_subcores = 1;
3047         cip->max_subcore_threads = vc->num_threads;
3048         cip->total_threads = vc->num_threads;
3049         cip->subcore_threads[0] = vc->num_threads;
3050         cip->vc[0] = vc;
3051 }
3052
3053 static bool subcore_config_ok(int n_subcores, int n_threads)
3054 {
3055         /*
3056          * POWER9 "SMT4" cores are permanently in what is effectively a 4-way
3057          * split-core mode, with one thread per subcore.
3058          */
3059         if (cpu_has_feature(CPU_FTR_ARCH_300))
3060                 return n_subcores <= 4 && n_threads == 1;
3061
3062         /* On POWER8, can only dynamically split if unsplit to begin with */
3063         if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
3064                 return false;
3065         if (n_subcores > MAX_SUBCORES)
3066                 return false;
3067         if (n_subcores > 1) {
3068                 if (!(dynamic_mt_modes & 2))
3069                         n_subcores = 4;
3070                 if (n_subcores > 2 && !(dynamic_mt_modes & 4))
3071                         return false;
3072         }
3073
3074         return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
3075 }
3076
3077 static void init_vcore_to_run(struct kvmppc_vcore *vc)
3078 {
3079         vc->entry_exit_map = 0;
3080         vc->in_guest = 0;
3081         vc->napping_threads = 0;
3082         vc->conferring_threads = 0;
3083         vc->tb_offset_applied = 0;
3084 }
3085
3086 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
3087 {
3088         int n_threads = vc->num_threads;
3089         int sub;
3090
3091         if (!cpu_has_feature(CPU_FTR_ARCH_207S))
3092                 return false;
3093
3094         /* In one_vm_per_core mode, require all vcores to be from the same vm */
3095         if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
3096                 return false;
3097
3098         if (n_threads < cip->max_subcore_threads)
3099                 n_threads = cip->max_subcore_threads;
3100         if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
3101                 return false;
3102         cip->max_subcore_threads = n_threads;
3103
3104         sub = cip->n_subcores;
3105         ++cip->n_subcores;
3106         cip->total_threads += vc->num_threads;
3107         cip->subcore_threads[sub] = vc->num_threads;
3108         cip->vc[sub] = vc;
3109         init_vcore_to_run(vc);
3110         list_del_init(&vc->preempt_list);
3111
3112         return true;
3113 }
3114
3115 /*
3116  * Work out whether it is possible to piggyback the execution of
3117  * vcore *pvc onto the execution of the other vcores described in *cip.
3118  */
3119 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
3120                           int target_threads)
3121 {
3122         if (cip->total_threads + pvc->num_threads > target_threads)
3123                 return false;
3124
3125         return can_dynamic_split(pvc, cip);
3126 }
3127
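/*
 * Drop from the runnable set any vcpus that can't enter the guest right
 * now: those with a signal pending, a pending VPA update, or (when HPT
 * and radix guests can't be mixed) the wrong MMU type for this host.
 */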
3128 static void prepare_threads(struct kvmppc_vcore *vc)
3129 {
3130         int i;
3131         struct kvm_vcpu *vcpu;
3132
3133         for_each_runnable_thread(i, vcpu, vc) {
3134                 if (signal_pending(vcpu->arch.run_task))
3135                         vcpu->arch.ret = -EINTR;
3136                 else if (no_mixing_hpt_and_radix &&
3137                          kvm_is_radix(vc->kvm) != radix_enabled())
3138                         vcpu->arch.ret = -EINVAL;
3139                 else if (vcpu->arch.vpa.update_pending ||
3140                          vcpu->arch.slb_shadow.update_pending ||
3141                          vcpu->arch.dtl.update_pending)
3142                         vcpu->arch.ret = RESUME_GUEST;
3143                 else
3144                         continue;
3145                 kvmppc_remove_runnable(vc, vcpu);
3146                 wake_up(&vcpu->arch.cpu_run);
3147         }
3148 }
3149
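/*
 * Scan this CPU's list of preempted vcores and pull in any that can run
 * alongside the current vcore as extra subcores, without exceeding
 * target_threads hardware threads in total.
 */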
3150 static void collect_piggybacks(struct core_info *cip, int target_threads)
3151 {
3152         struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3153         struct kvmppc_vcore *pvc, *vcnext;
3154
3155         spin_lock(&lp->lock);
3156         list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
3157                 if (!spin_trylock(&pvc->lock))
3158                         continue;
3159                 prepare_threads(pvc);
3160                 if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
3161                         list_del_init(&pvc->preempt_list);
3162                         if (pvc->runner == NULL) {
3163                                 pvc->vcore_state = VCORE_INACTIVE;
3164                                 kvmppc_core_end_stolen(pvc);
3165                         }
3166                         spin_unlock(&pvc->lock);
3167                         continue;
3168                 }
3169                 if (!can_piggyback(pvc, cip, target_threads)) {
3170                         spin_unlock(&pvc->lock);
3171                         continue;
3172                 }
3173                 kvmppc_core_end_stolen(pvc);
3174                 pvc->vcore_state = VCORE_PIGGYBACK;
3175                 if (cip->total_threads >= target_threads)
3176                         break;
3177         }
3178         spin_unlock(&lp->lock);
3179 }
3180
3181 static bool recheck_signals_and_mmu(struct core_info *cip)
3182 {
3183         int sub, i;
3184         struct kvm_vcpu *vcpu;
3185         struct kvmppc_vcore *vc;
3186
3187         for (sub = 0; sub < cip->n_subcores; ++sub) {
3188                 vc = cip->vc[sub];
3189                 if (!vc->kvm->arch.mmu_ready)
3190                         return true;
3191                 for_each_runnable_thread(i, vcpu, vc)
3192                         if (signal_pending(vcpu->arch.run_task))
3193                                 return true;
3194         }
3195         return false;
3196 }
3197
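/*
 * After the core has exited the guest, process each runnable vcpu of a
 * vcore: handle its exit if it trapped, decide whether it should go back
 * into the guest, and wake up any vcpu tasks that are finished running.
 * Takes and releases vc->lock.
 */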
3198 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
3199 {
3200         int still_running = 0, i;
3201         u64 now;
3202         long ret;
3203         struct kvm_vcpu *vcpu;
3204
3205         spin_lock(&vc->lock);
3206         now = get_tb();
3207         for_each_runnable_thread(i, vcpu, vc) {
3208                 /*
3209                  * It's safe to unlock the vcore in the loop here, because
3210                  * for_each_runnable_thread() is safe against removal of
3211                  * the vcpu, and the vcore state is VCORE_EXITING here,
3212                  * so any vcpus becoming runnable will have their arch.trap
3213                  * set to zero and can't actually run in the guest.
3214                  */
3215                 spin_unlock(&vc->lock);
3216                 /* cancel pending dec exception if dec is positive */
3217                 if (now < vcpu->arch.dec_expires &&
3218                     kvmppc_core_pending_dec(vcpu))
3219                         kvmppc_core_dequeue_dec(vcpu);
3220
3221                 trace_kvm_guest_exit(vcpu);
3222
3223                 ret = RESUME_GUEST;
3224                 if (vcpu->arch.trap)
3225                         ret = kvmppc_handle_exit_hv(vcpu,
3226                                                     vcpu->arch.run_task);
3227
3228                 vcpu->arch.ret = ret;
3229                 vcpu->arch.trap = 0;
3230
3231                 spin_lock(&vc->lock);
3232                 if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
3233                         if (vcpu->arch.pending_exceptions)
3234                                 kvmppc_core_prepare_to_enter(vcpu);
3235                         if (vcpu->arch.ceded)
3236                                 kvmppc_set_timer(vcpu);
3237                         else
3238                                 ++still_running;
3239                 } else {
3240                         kvmppc_remove_runnable(vc, vcpu);
3241                         wake_up(&vcpu->arch.cpu_run);
3242                 }
3243         }
3244         if (!is_master) {
3245                 if (still_running > 0) {
3246                         kvmppc_vcore_preempt(vc);
3247                 } else if (vc->runner) {
3248                         vc->vcore_state = VCORE_PREEMPT;
3249                         kvmppc_core_start_stolen(vc);
3250                 } else {
3251                         vc->vcore_state = VCORE_INACTIVE;
3252                 }
3253                 if (vc->n_runnable > 0 && vc->runner == NULL) {
3254                         /* make sure there's a candidate runner awake */
3255                         i = -1;
3256                         vcpu = next_runnable_thread(vc, &i);
3257                         wake_up(&vcpu->arch.cpu_run);
3258                 }
3259         }
3260         spin_unlock(&vc->lock);
3261 }
3262
3263 /*
3264  * Clear core from the list of active host cores as we are about to
3265  * enter the guest. Only do this if it is the primary thread of the
3266  * core (not if a subcore) that is entering the guest.
3267  */
3268 static inline int kvmppc_clear_host_core(unsigned int cpu)
3269 {
3270         int core;
3271
3272         if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3273                 return 0;
3274         /*
3275          * Memory barrier can be omitted here as we will do a smp_wmb()
3276          * later in kvmppc_start_thread and we need to ensure that state is
3277          * visible to other CPUs only after we enter guest.
3278          */
3279         core = cpu >> threads_shift;
3280         kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
3281         return 0;
3282 }
3283
3284 /*
3285  * Advertise this core as an active host core since we exited the guest
3286  * Advertise this core as an active host core since we exited the guest.
3287  * exiting.
3288  */
3289 static inline int kvmppc_set_host_core(unsigned int cpu)
3290 {
3291         int core;
3292
3293         if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3294                 return 0;
3295
3296         /*
3297          * Memory barrier can be omitted here because we do a spin_unlock
3298          * immediately after this which provides the memory barrier.
3299          */
3300         core = cpu >> threads_shift;
3301         kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
3302         return 0;
3303 }
3304
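/*
 * Record in the PACA any host interrupt that caused the guest exit so it
 * is replayed when interrupts are re-enabled; system resets are replayed
 * immediately.
 */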
3305 static void set_irq_happened(int trap)
3306 {
3307         switch (trap) {
3308         case BOOK3S_INTERRUPT_EXTERNAL:
3309                 local_paca->irq_happened |= PACA_IRQ_EE;
3310                 break;
3311         case BOOK3S_INTERRUPT_H_DOORBELL:
3312                 local_paca->irq_happened |= PACA_IRQ_DBELL;
3313                 break;
3314         case BOOK3S_INTERRUPT_HMI:
3315                 local_paca->irq_happened |= PACA_IRQ_HMI;
3316                 break;
3317         case BOOK3S_INTERRUPT_SYSTEM_RESET:
3318                 replay_system_reset();
3319                 break;
3320         }
3321 }
3322
3323 /*
3324  * Run a set of guest threads on a physical core.
3325  * Called with vc->lock held.
3326  */
3327 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3328 {
3329         struct kvm_vcpu *vcpu;
3330         int i;
3331         int srcu_idx;
3332         struct core_info core_info;
3333         struct kvmppc_vcore *pvc;
3334         struct kvm_split_mode split_info, *sip;
3335         int split, subcore_size, active;
3336         int sub;
3337         bool thr0_done;
3338         unsigned long cmd_bit, stat_bit;
3339         int pcpu, thr;
3340         int target_threads;
3341         int controlled_threads;
3342         int trap;
3343         bool is_power8;
3344
3345         /*
3346          * Remove from the list any threads that have a signal pending
3347          * or need a VPA update done
3348          */
3349         prepare_threads(vc);
3350
3351         /* if the runner is no longer runnable, let the caller pick a new one */
3352         if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
3353                 return;
3354
3355         /*
3356          * Initialize *vc.
3357          */
3358         init_vcore_to_run(vc);
3359         vc->preempt_tb = TB_NIL;
3360
3361         /*
3362          * Number of threads that we will be controlling: the same as
3363          * the number of threads per subcore, except on POWER9,
3364          * where it's 1 because the threads are (mostly) independent.
3365          */
3366         controlled_threads = threads_per_vcore(vc->kvm);
3367
3368         /*
3369          * Make sure we are running on primary threads, and that secondary
3370          * threads are offline.  Also check if the number of threads in this
3371          * guest is greater than the current system threads per guest.
3372          * On POWER9, we must not be in independent-threads mode if
3373          * this is an HPT guest on a radix host machine where the
3374          * CPU threads may not be in different MMU modes.
3375          */
3376         if ((controlled_threads > 1) &&
3377             ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
3378                 for_each_runnable_thread(i, vcpu, vc) {
3379                         vcpu->arch.ret = -EBUSY;
3380                         kvmppc_remove_runnable(vc, vcpu);
3381                         wake_up(&vcpu->arch.cpu_run);
3382                 }
3383                 goto out;
3384         }
3385
3386         /*
3387          * See if we could run any other vcores on the physical core
3388          * along with this one.
3389          */
3390         init_core_info(&core_info, vc);
3391         pcpu = smp_processor_id();
3392         target_threads = controlled_threads;
3393         if (target_smt_mode && target_smt_mode < target_threads)
3394                 target_threads = target_smt_mode;
3395         if (vc->num_threads < target_threads)
3396                 collect_piggybacks(&core_info, target_threads);
3397
3398         /*
3399          * On radix, arrange for TLB flushing if necessary.
3400          * This has to be done before disabling interrupts since
3401          * it uses smp_call_function().
3402          */
3403         pcpu = smp_processor_id();
3404         if (kvm_is_radix(vc->kvm)) {
3405                 for (sub = 0; sub < core_info.n_subcores; ++sub)
3406                         for_each_runnable_thread(i, vcpu, core_info.vc[sub])
3407                                 kvmppc_prepare_radix_vcpu(vcpu, pcpu);
3408         }
3409
3410         /*
3411          * Hard-disable interrupts, and check resched flag and signals.
3412          * If we need to reschedule or deliver a signal, clean up
3413          * and return without going into the guest(s).
3414          * If the mmu_ready flag has been cleared, don't go into the
3415          * guest because that means a HPT resize operation is in progress.
3416          */
3417         local_irq_disable();
3418         hard_irq_disable();
3419         if (lazy_irq_pending() || need_resched() ||
3420             recheck_signals_and_mmu(&core_info)) {
3421                 local_irq_enable();
3422                 vc->vcore_state = VCORE_INACTIVE;
3423                 /* Unlock all except the primary vcore */
3424                 for (sub = 1; sub < core_info.n_subcores; ++sub) {
3425                         pvc = core_info.vc[sub];
3426                         /* Put back on to the preempted vcores list */
3427                         kvmppc_vcore_preempt(pvc);
3428                         spin_unlock(&pvc->lock);
3429                 }
3430                 for (i = 0; i < controlled_threads; ++i)
3431                         kvmppc_release_hwthread(pcpu + i);
3432                 return;
3433         }
3434
3435         kvmppc_clear_host_core(pcpu);
3436
3437         /* Decide on micro-threading (split-core) mode */
3438         subcore_size = threads_per_subcore;
3439         cmd_bit = stat_bit = 0;
3440         split = core_info.n_subcores;
3441         sip = NULL;
3442         is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
3443                 && !cpu_has_feature(CPU_FTR_ARCH_300);
3444
3445         if (split > 1) {
3446                 sip = &split_info;
3447                 memset(&split_info, 0, sizeof(split_info));
3448                 for (sub = 0; sub < core_info.n_subcores; ++sub)
3449                         split_info.vc[sub] = core_info.vc[sub];
3450
3451                 if (is_power8) {
3452                         if (split == 2 && (dynamic_mt_modes & 2)) {
3453                                 cmd_bit = HID0_POWER8_1TO2LPAR;
3454                                 stat_bit = HID0_POWER8_2LPARMODE;
3455                         } else {
3456                                 split = 4;
3457                                 cmd_bit = HID0_POWER8_1TO4LPAR;
3458                                 stat_bit = HID0_POWER8_4LPARMODE;
3459                         }
3460                         subcore_size = MAX_SMT_THREADS / split;
3461                         split_info.rpr = mfspr(SPRN_RPR);
3462                         split_info.pmmar = mfspr(SPRN_PMMAR);
3463                         split_info.ldbar = mfspr(SPRN_LDBAR);
3464                         split_info.subcore_size = subcore_size;
3465                 } else {
3466                         split_info.subcore_size = 1;
3467                 }
3468
3469                 /* order writes to split_info before kvm_split_mode pointer */
3470                 smp_wmb();
3471         }
3472
3473         for (thr = 0; thr < controlled_threads; ++thr) {
3474                 struct paca_struct *paca = paca_ptrs[pcpu + thr];
3475
3476                 paca->kvm_hstate.napping = 0;
3477                 paca->kvm_hstate.kvm_split_mode = sip;
3478         }
3479
3480         /* Initiate micro-threading (split-core) on POWER8 if required */
3481         if (cmd_bit) {
3482                 unsigned long hid0 = mfspr(SPRN_HID0);
3483
3484                 hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
3485                 mb();
3486                 mtspr(SPRN_HID0, hid0);
3487                 isync();
3488                 for (;;) {
3489                         hid0 = mfspr(SPRN_HID0);
3490                         if (hid0 & stat_bit)
3491                                 break;
3492                         cpu_relax();
3493                 }
3494         }
3495
3496         /*
3497          * On POWER8, set the RWMR register.
3498          * Since it only affects PURR and SPURR, it doesn't affect
3499          * the host, so we don't save/restore the host value.
3500          */
3501         if (is_power8) {
3502                 unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
3503                 int n_online = atomic_read(&vc->online_count);
3504
3505                 /*
3506                  * Use the 8-thread value if we're doing split-core
3507                  * or if the vcore's online count looks bogus.
3508                  */
3509                 if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
3510                     n_online >= 1 && n_online <= MAX_SMT_THREADS)
3511                         rwmr_val = p8_rwmr_values[n_online];
3512                 mtspr(SPRN_RWMR, rwmr_val);
3513         }
3514
3515         /* Start all the threads */
3516         active = 0;
3517         for (sub = 0; sub < core_info.n_subcores; ++sub) {
3518                 thr = is_power8 ? subcore_thread_map[sub] : sub;
3519                 thr0_done = false;
3520                 active |= 1 << thr;
3521                 pvc = core_info.vc[sub];
3522                 pvc->pcpu = pcpu + thr;
3523                 for_each_runnable_thread(i, vcpu, pvc) {
3524                         kvmppc_start_thread(vcpu, pvc);
3525                         kvmppc_create_dtl_entry(vcpu, pvc);
3526                         trace_kvm_guest_enter(vcpu);
3527                         if (!vcpu->arch.ptid)
3528                                 thr0_done = true;
3529                         active |= 1 << (thr + vcpu->arch.ptid);
3530                 }
3531                 /*
3532                  * We need to start the first thread of each subcore
3533                  * even if it doesn't have a vcpu.
3534                  */
3535                 if (!thr0_done)
3536                         kvmppc_start_thread(NULL, pvc);
3537         }
3538
3539         /*
3540          * Ensure that split_info.do_nap is set after setting
3541          * the vcore pointer in the PACA of the secondaries.
3542          */
3543         smp_mb();
3544
3545         /*
3546          * When doing micro-threading, poke the inactive threads as well.
3547          * This gets them to the nap instruction after kvm_do_nap,
3548          * which reduces the time taken to unsplit later.
3549          */
3550         if (cmd_bit) {
3551                 split_info.do_nap = 1;  /* ask secondaries to nap when done */
3552                 for (thr = 1; thr < threads_per_subcore; ++thr)
3553                         if (!(active & (1 << thr)))
3554                                 kvmppc_ipi_thread(pcpu + thr);
3555         }
3556
3557         vc->vcore_state = VCORE_RUNNING;
3558         preempt_disable();
3559
3560         trace_kvmppc_run_core(vc, 0);
3561
3562         for (sub = 0; sub < core_info.n_subcores; ++sub)
3563                 spin_unlock(&core_info.vc[sub]->lock);
3564
3565         guest_enter_irqoff();
3566
3567         srcu_idx = srcu_read_lock(&vc->kvm->srcu);
3568
3569         this_cpu_disable_ftrace();
3570
3571         /*
3572          * Interrupts will be enabled once we get into the guest,
3573          * so tell lockdep that we're about to enable interrupts.
3574          */
3575         trace_hardirqs_on();
3576
3577         trap = __kvmppc_vcore_entry();
3578
3579         trace_hardirqs_off();
3580
3581         this_cpu_enable_ftrace();
3582
3583         srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
3584
3585         set_irq_happened(trap);
3586
3587         spin_lock(&vc->lock);
3588         /* prevent other vcpu threads from doing kvmppc_start_thread() now */
3589         vc->vcore_state = VCORE_EXITING;
3590
3591         /* wait for secondary threads to finish writing their state to memory */
3592         kvmppc_wait_for_nap(controlled_threads);
3593
3594         /* Return to whole-core mode if we split the core earlier */
3595         if (cmd_bit) {
3596                 unsigned long hid0 = mfspr(SPRN_HID0);
3597                 unsigned long loops = 0;
3598
3599                 hid0 &= ~HID0_POWER8_DYNLPARDIS;
3600                 stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
3601                 mb();
3602                 mtspr(SPRN_HID0, hid0);
3603                 isync();
3604                 for (;;) {
3605                         hid0 = mfspr(SPRN_HID0);
3606                         if (!(hid0 & stat_bit))
3607                                 break;
3608                         cpu_relax();
3609                         ++loops;
3610                 }
3611                 split_info.do_nap = 0;
3612         }
3613
3614         kvmppc_set_host_core(pcpu);
3615
3616         guest_exit_irqoff();
3617
3618         local_irq_enable();
3619
3620         /* Let secondaries go back to the offline loop */
3621         for (i = 0; i < controlled_threads; ++i) {
3622                 kvmppc_release_hwthread(pcpu + i);
3623                 if (sip && sip->napped[i])
3624                         kvmppc_ipi_thread(pcpu + i);
3625                 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
3626         }
3627
3628         spin_unlock(&vc->lock);
3629
3630         /* make sure updates to secondary vcpu structs are visible now */
3631         smp_mb();
3632
3633         preempt_enable();
3634
3635         for (sub = 0; sub < core_info.n_subcores; ++sub) {
3636                 pvc = core_info.vc[sub];
3637                 post_guest_process(pvc, pvc == vc);
3638         }
3639
3640         spin_lock(&vc->lock);
3641
3642  out:
3643         vc->vcore_state = VCORE_INACTIVE;
3644         trace_kvmppc_run_core(vc, 1);
3645 }
3646
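/*
 * Load the guest's values of assorted SPRs for the P9 virtual-mode entry
 * path; store_spr_state() below saves them back on the way out.
 */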
3647 static void load_spr_state(struct kvm_vcpu *vcpu)
3648 {
3649         mtspr(SPRN_DSCR, vcpu->arch.dscr);
3650         mtspr(SPRN_IAMR, vcpu->arch.iamr);
3651         mtspr(SPRN_PSPB, vcpu->arch.pspb);
3652         mtspr(SPRN_FSCR, vcpu->arch.fscr);
3653         mtspr(SPRN_TAR, vcpu->arch.tar);
3654         mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
3655         mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
3656         mtspr(SPRN_BESCR, vcpu->arch.bescr);
3657         mtspr(SPRN_WORT, vcpu->arch.wort);
3658         mtspr(SPRN_TIDR, vcpu->arch.tid);
3659         mtspr(SPRN_AMR, vcpu->arch.amr);
3660         mtspr(SPRN_UAMOR, vcpu->arch.uamor);
3661
3662         /*
3663          * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI]
3664          * clear (or hstate set appropriately to catch those registers
3665          * being clobbered if we take a MCE or SRESET), so those are done
3666          * being clobbered if we take an MCE or SRESET), so those are done
3667          */
3668
3669         if (!(vcpu->arch.ctrl & 1))
3670                 mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
3671 }
3672
3673 static void store_spr_state(struct kvm_vcpu *vcpu)
3674 {
3675         vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
3676
3677         vcpu->arch.iamr = mfspr(SPRN_IAMR);
3678         vcpu->arch.pspb = mfspr(SPRN_PSPB);
3679         vcpu->arch.fscr = mfspr(SPRN_FSCR);
3680         vcpu->arch.tar = mfspr(SPRN_TAR);
3681         vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
3682         vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
3683         vcpu->arch.bescr = mfspr(SPRN_BESCR);
3684         vcpu->arch.wort = mfspr(SPRN_WORT);
3685         vcpu->arch.tid = mfspr(SPRN_TIDR);
3686         vcpu->arch.amr = mfspr(SPRN_AMR);
3687         vcpu->arch.uamor = mfspr(SPRN_UAMOR);
3688         vcpu->arch.dscr = mfspr(SPRN_DSCR);
3689 }
3690
3691 /*
3692  * Privileged (non-hypervisor) host registers to save.
3693  */
3694 struct p9_host_os_sprs {
3695         unsigned long dscr;
3696         unsigned long tidr;
3697         unsigned long iamr;
3698         unsigned long amr;
3699         unsigned long fscr;
3700 };
3701
3702 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
3703 {
3704         host_os_sprs->dscr = mfspr(SPRN_DSCR);
3705         host_os_sprs->tidr = mfspr(SPRN_TIDR);
3706         host_os_sprs->iamr = mfspr(SPRN_IAMR);
3707         host_os_sprs->amr = mfspr(SPRN_AMR);
3708         host_os_sprs->fscr = mfspr(SPRN_FSCR);
3709 }
3710
3711 /* vcpu guest regs must already be saved */
3712 static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
3713                                     struct p9_host_os_sprs *host_os_sprs)
3714 {
3715         mtspr(SPRN_PSPB, 0);
3716         mtspr(SPRN_WORT, 0);
3717         mtspr(SPRN_UAMOR, 0);
3718
3719         mtspr(SPRN_DSCR, host_os_sprs->dscr);
3720         mtspr(SPRN_TIDR, host_os_sprs->tidr);
3721         mtspr(SPRN_IAMR, host_os_sprs->iamr);
3722
3723         if (host_os_sprs->amr != vcpu->arch.amr)
3724                 mtspr(SPRN_AMR, host_os_sprs->amr);
3725
3726         if (host_os_sprs->fscr != vcpu->arch.fscr)
3727                 mtspr(SPRN_FSCR, host_os_sprs->fscr);
3728
3729         /* Save guest CTRL register, set runlatch to 1 */
3730         if (!(vcpu->arch.ctrl & 1))
3731                 mtspr(SPRN_CTRLT, 1);
3732 }
3733
3734 static inline bool hcall_is_xics(unsigned long req)
3735 {
3736         return req == H_EOI || req == H_CPPR || req == H_IPI ||
3737                 req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
3738 }
3739
3740 /*
3741  * Virtual-mode guest entry for POWER9 and later when the host and
3742  * guest are both using the radix MMU.  The LPIDR has already been set.
3743  */
3744 static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3745                          unsigned long lpcr)
3746 {
3747         struct kvmppc_vcore *vc = vcpu->arch.vcore;
3748         struct p9_host_os_sprs host_os_sprs;
3749         s64 dec;
3750         u64 tb;
3751         int trap, save_pmu;
3752
3753         WARN_ON_ONCE(vcpu->arch.ceded);
3754
3755         dec = mfspr(SPRN_DEC);
3756         tb = mftb();
3757         if (dec < 0)
3758                 return BOOK3S_INTERRUPT_HV_DECREMENTER;
3759         local_paca->kvm_hstate.dec_expires = dec + tb;
3760         if (local_paca->kvm_hstate.dec_expires < time_limit)
3761                 time_limit = local_paca->kvm_hstate.dec_expires;
3762
3763         save_p9_host_os_sprs(&host_os_sprs);
3764
3765         kvmhv_save_host_pmu();          /* saves it to PACA kvm_hstate */
3766
3767         kvmppc_subcore_enter_guest();
3768
3769         vc->entry_exit_map = 1;
3770         vc->in_guest = 1;
3771
3772         if (vcpu->arch.vpa.pinned_addr) {
3773                 struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3774                 u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
3775                 lp->yield_count = cpu_to_be32(yield_count);
3776                 vcpu->arch.vpa.dirty = 1;
3777         }
3778
3779         if (cpu_has_feature(CPU_FTR_TM) ||
3780             cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3781                 kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
3782
3783         kvmhv_load_guest_pmu(vcpu);
3784
3785         msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3786         load_fp_state(&vcpu->arch.fp);
3787 #ifdef CONFIG_ALTIVEC
3788         load_vr_state(&vcpu->arch.vr);
3789 #endif
3790         mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
3791
3792         load_spr_state(vcpu);
3793
3794         /*
3795          * When setting DEC, we must always deal with irq_work_raise via NMI vs
3796          * setting DEC. The problem occurs right as we switch into guest mode
3797          * if an NMI hits and sets pending work and sets DEC, then that will
3798          * apply to the guest and not bring us back to the host.
3799          *
3800          * irq_work_raise could check a flag (or possibly LPCR[HDICE] for
3801          * example) and set HDEC to 1? That wouldn't solve the nested hv
3802          * case which needs to abort the hcall or zero the time limit.
3803          *
3804          * XXX: Another day's problem.
3805          */
3806         mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
3807
3808         if (kvmhv_on_pseries()) {
3809                 /*
3810                  * We need to save and restore the guest visible part of the
3811                  * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
3812          * doesn't do this for us. Note this is only required on pseries,
3813          * since otherwise it is done in kvmhv_vcpu_entry_p9() below.
3814                  */
3815                 unsigned long host_psscr;
3816                 /* call our hypervisor to load up HV regs and go */
3817                 struct hv_guest_state hvregs;
3818
3819                 host_psscr = mfspr(SPRN_PSSCR_PR);
3820                 mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
3821                 kvmhv_save_hv_regs(vcpu, &hvregs);
3822                 hvregs.lpcr = lpcr;
3823                 vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
3824                 hvregs.version = HV_GUEST_STATE_VERSION;
3825                 if (vcpu->arch.nested) {
3826                         hvregs.lpid = vcpu->arch.nested->shadow_lpid;
3827                         hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
3828                 } else {
3829                         hvregs.lpid = vcpu->kvm->arch.lpid;
3830                         hvregs.vcpu_token = vcpu->vcpu_id;
3831                 }
3832                 hvregs.hdec_expiry = time_limit;
3833                 mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
3834                 mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
3835                 trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
3836                                           __pa(&vcpu->arch.regs));
3837                 kvmhv_restore_hv_return_state(vcpu, &hvregs);
3838                 vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
3839                 vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
3840                 vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
3841                 vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
3842                 mtspr(SPRN_PSSCR_PR, host_psscr);
3843
3844                 /* H_CEDE has to be handled now, not later */
3845                 if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
3846                     kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
3847                         kvmppc_cede(vcpu);
3848                         kvmppc_set_gpr(vcpu, 3, 0);
3849                         trap = 0;
3850                 }
3851         } else {
3852                 kvmppc_xive_push_vcpu(vcpu);
3853                 trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr);
3854                 if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
3855                     !(vcpu->arch.shregs.msr & MSR_PR)) {
3856                         unsigned long req = kvmppc_get_gpr(vcpu, 3);
3857
3858                         /* H_CEDE has to be handled now, not later */
3859                         if (req == H_CEDE) {
3860                                 kvmppc_cede(vcpu);
3861                                 kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
3862                                 kvmppc_set_gpr(vcpu, 3, 0);
3863                                 trap = 0;
3864
3865                         /* XICS hcalls must be handled before xive is pulled */
3866                         } else if (hcall_is_xics(req)) {
3867                                 int ret;
3868
3869                                 ret = kvmppc_xive_xics_hcall(vcpu, req);
3870                                 if (ret != H_TOO_HARD) {
3871                                         kvmppc_set_gpr(vcpu, 3, ret);
3872                                         trap = 0;
3873                                 }
3874                         }
3875                 }
3876                 kvmppc_xive_pull_vcpu(vcpu);
3877
3878                 if (kvm_is_radix(vcpu->kvm))
3879                         vcpu->arch.slb_max = 0;
3880         }
3881
3882         dec = mfspr(SPRN_DEC);
3883         if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
3884                 dec = (s32) dec;
3885         tb = mftb();
3886         vcpu->arch.dec_expires = dec + tb;
3887         vcpu->cpu = -1;
3888         vcpu->arch.thread_cpu = -1;
3889
3890         store_spr_state(vcpu);
3891
3892         restore_p9_host_os_sprs(vcpu, &host_os_sprs);
3893
3894         msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3895         store_fp_state(&vcpu->arch.fp);
3896 #ifdef CONFIG_ALTIVEC
3897         store_vr_state(&vcpu->arch.vr);
3898 #endif
3899         vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
3900
3901         if (cpu_has_feature(CPU_FTR_TM) ||
3902             cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3903                 kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
3904
3905         save_pmu = 1;
3906         if (vcpu->arch.vpa.pinned_addr) {
3907                 struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3908                 u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
3909                 lp->yield_count = cpu_to_be32(yield_count);
3910                 vcpu->arch.vpa.dirty = 1;
3911                 save_pmu = lp->pmcregs_in_use;
3912         }
3913         /* Must save pmu if this guest is capable of running nested guests */
3914         save_pmu |= nesting_enabled(vcpu->kvm);
3915
3916         kvmhv_save_guest_pmu(vcpu, save_pmu);
3917
3918         vc->entry_exit_map = 0x101;
3919         vc->in_guest = 0;
3920
3921         mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
3922         /* We may have raced with new irq work */
3923         if (test_irq_work_pending())
3924                 set_dec(1);
3925         mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
3926
3927         kvmhv_load_host_pmu();
3928
3929         kvmppc_subcore_exit_guest();
3930
3931         return trap;
3932 }
3933
3934 /*
3935  * Wait for some other vcpu thread to execute us, and
3936  * wake us up when we need to handle something in the host.
3937  */
3938 static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
3939                                  struct kvm_vcpu *vcpu, int wait_state)
3940 {
3941         DEFINE_WAIT(wait);
3942
3943         prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
3944         if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
3945                 spin_unlock(&vc->lock);
3946                 schedule();
3947                 spin_lock(&vc->lock);
3948         }
3949         finish_wait(&vcpu->arch.cpu_run, &wait);
3950 }
3951
3952 static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
3953 {
3954         if (!halt_poll_ns_grow)
3955                 return;
3956
3957         vc->halt_poll_ns *= halt_poll_ns_grow;
3958         if (vc->halt_poll_ns < halt_poll_ns_grow_start)
3959                 vc->halt_poll_ns = halt_poll_ns_grow_start;
3960 }
3961
3962 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
3963 {
3964         if (halt_poll_ns_shrink == 0)
3965                 vc->halt_poll_ns = 0;
3966         else
3967                 vc->halt_poll_ns /= halt_poll_ns_shrink;
3968 }
3969
3970 #ifdef CONFIG_KVM_XICS
3971 static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
3972 {
3973         if (!xics_on_xive())
3974                 return false;
3975         return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
3976                 vcpu->arch.xive_saved_state.cppr;
3977 }
3978 #else
3979 static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
3980 {
3981         return false;
3982 }
3983 #endif /* CONFIG_KVM_XICS */
3984
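/*
 * Return true if something is pending that should bring this vcpu out of
 * ceded state: a queued exception, a prod (H_PROD), a doorbell, or a
 * pending XIVE interrupt.
 */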
3985 static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
3986 {
3987         if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
3988             kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
3989                 return true;
3990
3991         return false;
3992 }
3993
3994 /*
3995  * Check to see if any of the runnable vcpus on the vcore have pending
3996  * exceptions or are no longer ceded
3997  */
3998 static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
3999 {
4000         struct kvm_vcpu *vcpu;
4001         int i;
4002
4003         for_each_runnable_thread(i, vcpu, vc) {
4004                 if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
4005                         return 1;
4006         }
4007
4008         return 0;
4009 }
4010
4011 /*
4012  * All the vcpus in this vcore are idle, so wait for a decrementer
4013  * or external interrupt to one of the vcpus.  vc->lock is held.
4014  */
4015 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
4016 {
4017         ktime_t cur, start_poll, start_wait;
4018         int do_sleep = 1;
4019         u64 block_ns;
4020
4021         /* Poll for pending exceptions and ceded state */
4022         cur = start_poll = ktime_get();
4023         if (vc->halt_poll_ns) {
4024                 ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
4025                 ++vc->runner->stat.halt_attempted_poll;
4026
4027                 vc->vcore_state = VCORE_POLLING;
4028                 spin_unlock(&vc->lock);
4029
4030                 do {
4031                         if (kvmppc_vcore_check_block(vc)) {
4032                                 do_sleep = 0;
4033                                 break;
4034                         }
4035                         cur = ktime_get();
4036                 } while (single_task_running() && ktime_before(cur, stop));
4037
4038                 spin_lock(&vc->lock);
4039                 vc->vcore_state = VCORE_INACTIVE;
4040
4041                 if (!do_sleep) {
4042                         ++vc->runner->stat.halt_successful_poll;
4043                         goto out;
4044                 }
4045         }
4046
4047         prepare_to_rcuwait(&vc->wait);
4048         set_current_state(TASK_INTERRUPTIBLE);
4049         if (kvmppc_vcore_check_block(vc)) {
4050                 finish_rcuwait(&vc->wait);
4051                 do_sleep = 0;
4052                 /* If we polled, count this as a successful poll */
4053                 if (vc->halt_poll_ns)
4054                         ++vc->runner->stat.halt_successful_poll;
4055                 goto out;
4056         }
4057
4058         start_wait = ktime_get();
4059
4060         vc->vcore_state = VCORE_SLEEPING;
4061         trace_kvmppc_vcore_blocked(vc, 0);
4062         spin_unlock(&vc->lock);
4063         schedule();
4064         finish_rcuwait(&vc->wait);
4065         spin_lock(&vc->lock);
4066         vc->vcore_state = VCORE_INACTIVE;
4067         trace_kvmppc_vcore_blocked(vc, 1);
4068         ++vc->runner->stat.halt_successful_wait;
4069
4070         cur = ktime_get();
4071
4072 out:
4073         block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
4074
4075         /* Attribute wait time */
4076         if (do_sleep) {
4077                 vc->runner->stat.halt_wait_ns +=
4078                         ktime_to_ns(cur) - ktime_to_ns(start_wait);
4079                 /* Attribute failed poll time */
4080                 if (vc->halt_poll_ns)
4081                         vc->runner->stat.halt_poll_fail_ns +=
4082                                 ktime_to_ns(start_wait) -
4083                                 ktime_to_ns(start_poll);
4084         } else {
4085                 /* Attribute successful poll time */
4086                 if (vc->halt_poll_ns)
4087                         vc->runner->stat.halt_poll_success_ns +=
4088                                 ktime_to_ns(cur) -
4089                                 ktime_to_ns(start_poll);
4090         }
4091
4092         /* Adjust poll time */
4093         if (halt_poll_ns) {
4094                 if (block_ns <= vc->halt_poll_ns)
4095                         ;
4096                 /* We slept and blocked for longer than the max halt time */
4097                 else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
4098                         shrink_halt_poll_ns(vc);
4099                 /* We slept and our poll time is too small */
4100                 else if (vc->halt_poll_ns < halt_poll_ns &&
4101                                 block_ns < halt_poll_ns)
4102                         grow_halt_poll_ns(vc);
4103                 if (vc->halt_poll_ns > halt_poll_ns)
4104                         vc->halt_poll_ns = halt_poll_ns;
4105         } else
4106                 vc->halt_poll_ns = 0;
4107
4108         trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
4109 }
4110
4111 /*
4112  * This never fails for a radix guest, as none of the operations it does
4113  * for a radix guest can fail or have a way to report failure.
4114  */
4115 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
4116 {
4117         int r = 0;
4118         struct kvm *kvm = vcpu->kvm;
4119
4120         mutex_lock(&kvm->arch.mmu_setup_lock);
4121         if (!kvm->arch.mmu_ready) {
4122                 if (!kvm_is_radix(kvm))
4123                         r = kvmppc_hv_setup_htab_rma(vcpu);
4124                 if (!r) {
4125                         if (cpu_has_feature(CPU_FTR_ARCH_300))
4126                                 kvmppc_setup_partition_table(kvm);
4127                         kvm->arch.mmu_ready = 1;
4128                 }
4129         }
4130         mutex_unlock(&kvm->arch.mmu_setup_lock);
4131         return r;
4132 }
4133
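/*
 * Make this vcpu runnable on its virtual core and either join a run that
 * is already in progress or act as the vcore's runner task, driving the
 * core into the guest, until the vcpu stops being runnable or a signal
 * is pending for the runner.
 */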
4134 static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
4135 {
4136         struct kvm_run *run = vcpu->run;
4137         int n_ceded, i, r;
4138         struct kvmppc_vcore *vc;
4139         struct kvm_vcpu *v;
4140
4141         trace_kvmppc_run_vcpu_enter(vcpu);
4142
4143         run->exit_reason = 0;
4144         vcpu->arch.ret = RESUME_GUEST;
4145         vcpu->arch.trap = 0;
4146         kvmppc_update_vpas(vcpu);
4147
4148         /*
4149          * Synchronize with other threads in this virtual core
4150          */
4151         vc = vcpu->arch.vcore;
4152         spin_lock(&vc->lock);
4153         vcpu->arch.ceded = 0;
4154         vcpu->arch.run_task = current;
4155         vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
4156         vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4157         vcpu->arch.busy_preempt = TB_NIL;
4158         WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
4159         ++vc->n_runnable;
4160
4161         /*
4162          * This happens the first time this is called for a vcpu.
4163          * If the vcore is already running, we may be able to start
4164          * this thread straight away and have it join in.
4165          */
4166         if (!signal_pending(current)) {
4167                 if ((vc->vcore_state == VCORE_PIGGYBACK ||
4168                      vc->vcore_state == VCORE_RUNNING) &&
4169                            !VCORE_IS_EXITING(vc)) {
4170                         kvmppc_create_dtl_entry(vcpu, vc);
4171                         kvmppc_start_thread(vcpu, vc);
4172                         trace_kvm_guest_enter(vcpu);
4173                 } else if (vc->vcore_state == VCORE_SLEEPING) {
4174                         rcuwait_wake_up(&vc->wait);
4175                 }
4176
4177         }
4178
4179         while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4180                !signal_pending(current)) {
4181                 /* See if the MMU is ready to go */
4182                 if (!vcpu->kvm->arch.mmu_ready) {
4183                         spin_unlock(&vc->lock);
4184                         r = kvmhv_setup_mmu(vcpu);
4185                         spin_lock(&vc->lock);
4186                         if (r) {
4187                                 run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4188                                 run->fail_entry.
4189                                         hardware_entry_failure_reason = 0;
4190                                 vcpu->arch.ret = r;
4191                                 break;
4192                         }
4193                 }
4194
4195                 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4196                         kvmppc_vcore_end_preempt(vc);
4197
4198                 if (vc->vcore_state != VCORE_INACTIVE) {
4199                         kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
4200                         continue;
4201                 }
4202                 for_each_runnable_thread(i, v, vc) {
4203                         kvmppc_core_prepare_to_enter(v);
4204                         if (signal_pending(v->arch.run_task)) {
4205                                 kvmppc_remove_runnable(vc, v);
4206                                 v->stat.signal_exits++;
4207                                 v->run->exit_reason = KVM_EXIT_INTR;
4208                                 v->arch.ret = -EINTR;
4209                                 wake_up(&v->arch.cpu_run);
4210                         }
4211                 }
4212                 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
4213                         break;
4214                 n_ceded = 0;
4215                 for_each_runnable_thread(i, v, vc) {
4216                         if (!kvmppc_vcpu_woken(v))
4217                                 n_ceded += v->arch.ceded;
4218                         else
4219                                 v->arch.ceded = 0;
4220                 }
4221                 vc->runner = vcpu;
4222                 if (n_ceded == vc->n_runnable) {
4223                         kvmppc_vcore_blocked(vc);
4224                 } else if (need_resched()) {
4225                         kvmppc_vcore_preempt(vc);
4226                         /* Let something else run */
4227                         cond_resched_lock(&vc->lock);
4228                         if (vc->vcore_state == VCORE_PREEMPT)
4229                                 kvmppc_vcore_end_preempt(vc);
4230                 } else {
4231                         kvmppc_run_core(vc);
4232                 }
4233                 vc->runner = NULL;
4234         }
4235
4236         while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4237                (vc->vcore_state == VCORE_RUNNING ||
4238                 vc->vcore_state == VCORE_EXITING ||
4239                 vc->vcore_state == VCORE_PIGGYBACK))
4240                 kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
4241
4242         if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4243                 kvmppc_vcore_end_preempt(vc);
4244
4245         if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4246                 kvmppc_remove_runnable(vc, vcpu);
4247                 vcpu->stat.signal_exits++;
4248                 run->exit_reason = KVM_EXIT_INTR;
4249                 vcpu->arch.ret = -EINTR;
4250         }
4251
4252         if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
4253                 /* Wake up some vcpu to run the core */
4254                 i = -1;
4255                 v = next_runnable_thread(vc, &i);
4256                 wake_up(&v->arch.cpu_run);
4257         }
4258
4259         trace_kvmppc_run_vcpu_exit(vcpu);
4260         spin_unlock(&vc->lock);
4261         return vcpu->arch.ret;
4262 }
4263
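/*
 * Run a single vcpu on the current CPU using the POWER9 (ISA v3.0)
 * entry path, kvmhv_p9_guest_entry(), without gathering sibling
 * threads into a vcore.  time_limit bounds how long the guest may run
 * and lpcr supplies the LPCR value to use for this entry.
 */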
4264 int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
4265                           unsigned long lpcr)
4266 {
4267         struct kvm_run *run = vcpu->run;
4268         int trap, r, pcpu;
4269         int srcu_idx;
4270         struct kvmppc_vcore *vc;
4271         struct kvm *kvm = vcpu->kvm;
4272         struct kvm_nested_guest *nested = vcpu->arch.nested;
4273
4274         trace_kvmppc_run_vcpu_enter(vcpu);
4275
4276         run->exit_reason = 0;
4277         vcpu->arch.ret = RESUME_GUEST;
4278         vcpu->arch.trap = 0;
4279
4280         vc = vcpu->arch.vcore;
4281         vcpu->arch.ceded = 0;
4282         vcpu->arch.run_task = current;
4283         vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
4284         vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4285         vcpu->arch.busy_preempt = TB_NIL;
4286         vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
4287         vc->runnable_threads[0] = vcpu;
4288         vc->n_runnable = 1;
4289         vc->runner = vcpu;
4290
4291         /* See if the MMU is ready to go */
4292         if (!kvm->arch.mmu_ready) {
4293                 r = kvmhv_setup_mmu(vcpu);
4294                 if (r) {
4295                         run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4296                         run->fail_entry.hardware_entry_failure_reason = 0;
4297                         vcpu->arch.ret = r;
4298                         return r;
4299                 }
4300         }
4301
4302         if (need_resched())
4303                 cond_resched();
4304
4305         kvmppc_update_vpas(vcpu);
4306
4307         init_vcore_to_run(vc);
4308         vc->preempt_tb = TB_NIL;
4309
4310         preempt_disable();
4311         pcpu = smp_processor_id();
4312         vc->pcpu = pcpu;
4313         if (kvm_is_radix(kvm))
4314                 kvmppc_prepare_radix_vcpu(vcpu, pcpu);
4315
4316         local_irq_disable();
4317         hard_irq_disable();
4318         if (signal_pending(current))
4319                 goto sigpend;
4320         if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
4321                 goto out;
4322
4323         if (!nested) {
4324                 kvmppc_core_prepare_to_enter(vcpu);
4325                 if (vcpu->arch.doorbell_request) {
4326                         vc->dpdes = 1;
4327                         smp_wmb();
4328                         vcpu->arch.doorbell_request = 0;
4329                 }
4330                 if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
4331                              &vcpu->arch.pending_exceptions))
4332                         lpcr |= LPCR_MER;
4333         } else if (vcpu->arch.pending_exceptions ||
4334                    vcpu->arch.doorbell_request ||
4335                    xive_interrupt_pending(vcpu)) {
4336                 vcpu->arch.ret = RESUME_HOST;
4337                 goto out;
4338         }
4339
4340         kvmppc_clear_host_core(pcpu);
4341
4342         local_paca->kvm_hstate.napping = 0;
4343         local_paca->kvm_hstate.kvm_split_mode = NULL;
4344         kvmppc_start_thread(vcpu, vc);
4345         kvmppc_create_dtl_entry(vcpu, vc);
4346         trace_kvm_guest_enter(vcpu);
4347
4348         vc->vcore_state = VCORE_RUNNING;
4349         trace_kvmppc_run_core(vc, 0);
4350
4351         guest_enter_irqoff();
4352
4353         srcu_idx = srcu_read_lock(&kvm->srcu);
4354
4355         this_cpu_disable_ftrace();
4356
4357         /* Tell lockdep that we're about to enable interrupts */
4358         trace_hardirqs_on();
4359
4360         trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
4361         vcpu->arch.trap = trap;
4362
4363         trace_hardirqs_off();
4364
4365         this_cpu_enable_ftrace();
4366
4367         srcu_read_unlock(&kvm->srcu, srcu_idx);
4368
4369         set_irq_happened(trap);
4370
4371         kvmppc_set_host_core(pcpu);
4372
4373         guest_exit_irqoff();
4374
4375         local_irq_enable();
4376
4377         cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
4378
4379         preempt_enable();
4380
4381         /*
4382          * cancel pending decrementer exception if DEC is now positive, or if
4383          * entering a nested guest in which case the decrementer is now owned
4384          * by L2 and the L1 decrementer is provided in hdec_expires
4385          */
4386         if (kvmppc_core_pending_dec(vcpu) &&
4387                         ((get_tb() < vcpu->arch.dec_expires) ||
4388                          (trap == BOOK3S_INTERRUPT_SYSCALL &&
4389                           kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
4390                 kvmppc_core_dequeue_dec(vcpu);
4391
4392         trace_kvm_guest_exit(vcpu);
4393         r = RESUME_GUEST;
4394         if (trap) {
4395                 if (!nested)
4396                         r = kvmppc_handle_exit_hv(vcpu, current);
4397                 else
4398                         r = kvmppc_handle_nested_exit(vcpu);
4399         }
4400         vcpu->arch.ret = r;
4401
4402         if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
4403             !kvmppc_vcpu_woken(vcpu)) {
4404                 kvmppc_set_timer(vcpu);
4405                 while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
4406                         if (signal_pending(current)) {
4407                                 vcpu->stat.signal_exits++;
4408                                 run->exit_reason = KVM_EXIT_INTR;
4409                                 vcpu->arch.ret = -EINTR;
4410                                 break;
4411                         }
4412                         spin_lock(&vc->lock);
4413                         kvmppc_vcore_blocked(vc);
4414                         spin_unlock(&vc->lock);
4415                 }
4416         }
4417         vcpu->arch.ceded = 0;
4418
4419         vc->vcore_state = VCORE_INACTIVE;
4420         trace_kvmppc_run_core(vc, 1);
4421
4422  done:
4423         kvmppc_remove_runnable(vc, vcpu);
4424         trace_kvmppc_run_vcpu_exit(vcpu);
4425
4426         return vcpu->arch.ret;
4427
4428  sigpend:
4429         vcpu->stat.signal_exits++;
4430         run->exit_reason = KVM_EXIT_INTR;
4431         vcpu->arch.ret = -EINTR;
4432  out:
4433         local_irq_enable();
4434         preempt_enable();
4435         goto done;
4436 }
4437
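/*
 * Top-level KVM_RUN handler for HV guests.  Saves the host's EBB, TAR
 * and VRSAVE state, then loops entering the guest and handling hcalls,
 * page faults and passthrough interrupts until an exit that userspace
 * has to see, and finally restores the host register state.
 */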
4438 static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
4439 {
4440         struct kvm_run *run = vcpu->run;
4441         int r;
4442         int srcu_idx;
4443         unsigned long ebb_regs[3] = {}; /* shut up GCC */
4444         unsigned long user_tar = 0;
4445         unsigned int user_vrsave;
4446         struct kvm *kvm;
4447
4448         if (!vcpu->arch.sane) {
4449                 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4450                 return -EINVAL;
4451         }
4452
4453         /*
4454          * Don't allow entry with a suspended transaction, because
4455          * the guest entry/exit code will lose it.
4456          * If the guest has TM enabled, save away their TM-related SPRs
4457          * (they will get restored by the TM unavailable interrupt).
4458          */
4459 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
4460         if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
4461             (current->thread.regs->msr & MSR_TM)) {
4462                 if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
4463                         run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4464                         run->fail_entry.hardware_entry_failure_reason = 0;
4465                         return -EINVAL;
4466                 }
4467                 /* Enable TM so we can read the TM SPRs */
4468                 mtmsr(mfmsr() | MSR_TM);
4469                 current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
4470                 current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
4471                 current->thread.tm_texasr = mfspr(SPRN_TEXASR);
4472                 current->thread.regs->msr &= ~MSR_TM;
4473         }
4474 #endif
4475
4476         /*
4477          * Force online to 1 for the sake of old userspace which doesn't
4478          * set it.
4479          */
4480         if (!vcpu->arch.online) {
4481                 atomic_inc(&vcpu->arch.vcore->online_count);
4482                 vcpu->arch.online = 1;
4483         }
4484
4485         kvmppc_core_prepare_to_enter(vcpu);
4486
4487         /* No need to go into the guest when all we'll do is come back out */
4488         if (signal_pending(current)) {
4489                 run->exit_reason = KVM_EXIT_INTR;
4490                 return -EINTR;
4491         }
4492
4493         kvm = vcpu->kvm;
4494         atomic_inc(&kvm->arch.vcpus_running);
4495         /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
4496         smp_mb();
4497
4498         flush_all_to_thread(current);
4499
4500         /* Save userspace EBB and other register values */
4501         if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
4502                 ebb_regs[0] = mfspr(SPRN_EBBHR);
4503                 ebb_regs[1] = mfspr(SPRN_EBBRR);
4504                 ebb_regs[2] = mfspr(SPRN_BESCR);
4505                 user_tar = mfspr(SPRN_TAR);
4506         }
4507         user_vrsave = mfspr(SPRN_VRSAVE);
4508
4509         vcpu->arch.waitp = &vcpu->arch.vcore->wait;
4510         vcpu->arch.pgdir = kvm->mm->pgd;
4511         vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
4512
4513         do {
4514                 if (cpu_has_feature(CPU_FTR_ARCH_300))
4515                         r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
4516                                                   vcpu->arch.vcore->lpcr);
4517                 else
4518                         r = kvmppc_run_vcpu(vcpu);
4519
4520                 if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
4521                         if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
4522                                 /*
4523                                  * These should have been caught and reflected
4524                                  * into the guest by now. Final sanity check:
4525                                  * don't allow userspace to execute hcalls in
4526                                  * the hypervisor.
4527                                  */
4528                                 r = RESUME_GUEST;
4529                                 continue;
4530                         }
4531                         trace_kvm_hcall_enter(vcpu);
4532                         r = kvmppc_pseries_do_hcall(vcpu);
4533                         trace_kvm_hcall_exit(vcpu, r);
4534                         kvmppc_core_prepare_to_enter(vcpu);
4535                 } else if (r == RESUME_PAGE_FAULT) {
4536                         srcu_idx = srcu_read_lock(&kvm->srcu);
4537                         r = kvmppc_book3s_hv_page_fault(vcpu,
4538                                 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
4539                         srcu_read_unlock(&kvm->srcu, srcu_idx);
4540                 } else if (r == RESUME_PASSTHROUGH) {
4541                         if (WARN_ON(xics_on_xive()))
4542                                 r = H_SUCCESS;
4543                         else
4544                                 r = kvmppc_xics_rm_complete(vcpu, 0);
4545                 }
4546         } while (is_kvmppc_resume_guest(r));
4547
4548         /* Restore userspace EBB and other register values */
4549         if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
4550                 mtspr(SPRN_EBBHR, ebb_regs[0]);
4551                 mtspr(SPRN_EBBRR, ebb_regs[1]);
4552                 mtspr(SPRN_BESCR, ebb_regs[2]);
4553                 mtspr(SPRN_TAR, user_tar);
4554         }
4555         mtspr(SPRN_VRSAVE, user_vrsave);
4556
4557         vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
4558         atomic_dec(&kvm->arch.vcpus_running);
4559         return r;
4560 }
4561
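/*
 * Fill in one segment page-size entry for KVM_PPC_GET_SMMU_INFO and
 * advance the caller's cursor to the next free slot.
 */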
4562 static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
4563                                      int shift, int sllp)
4564 {
4565         (*sps)->page_shift = shift;
4566         (*sps)->slb_enc = sllp;
4567         (*sps)->enc[0].page_shift = shift;
4568         (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
4569         /*
4570          * Add 16MB MPSS support (may get filtered out by userspace)
4571          */
4572         if (shift != 24) {
4573                 int penc = kvmppc_pgsize_lp_encoding(shift, 24);
4574                 if (penc != -1) {
4575                         (*sps)->enc[1].page_shift = 24;
4576                         (*sps)->enc[1].pte_enc = penc;
4577                 }
4578         }
4579         (*sps)++;
4580 }
4581
4582 static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
4583                                          struct kvm_ppc_smmu_info *info)
4584 {
4585         struct kvm_ppc_one_seg_page_size *sps;
4586
4587         /*
4588          * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
4589          * POWER7 doesn't support keys for instruction accesses,
4590          * POWER8 and POWER9 do.
4591          */
4592         info->data_keys = 32;
4593         info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
4594
4595         /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
4596         info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
4597         info->slb_size = 32;
4598
4599         /* We only support these sizes for now, and no multi-size segments */
4600         sps = &info->sps[0];
4601         kvmppc_add_seg_page_size(&sps, 12, 0);
4602         kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
4603         kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
4604
4605         /* If running as a nested hypervisor, we don't support HPT guests */
4606         if (kvmhv_on_pseries())
4607                 info->flags |= KVM_PPC_NO_HASH;
4608
4609         return 0;
4610 }
4611
4612 /*
4613  * Get (and clear) the dirty memory log for a memory slot.
4614  */
4615 static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
4616                                          struct kvm_dirty_log *log)
4617 {
4618         struct kvm_memslots *slots;
4619         struct kvm_memory_slot *memslot;
4620         int i, r;
4621         unsigned long n;
4622         unsigned long *buf, *p;
4623         struct kvm_vcpu *vcpu;
4624
4625         mutex_lock(&kvm->slots_lock);
4626
4627         r = -EINVAL;
4628         if (log->slot >= KVM_USER_MEM_SLOTS)
4629                 goto out;
4630
4631         slots = kvm_memslots(kvm);
4632         memslot = id_to_memslot(slots, log->slot);
4633         r = -ENOENT;
4634         if (!memslot || !memslot->dirty_bitmap)
4635                 goto out;
4636
4637         /*
4638          * Use second half of bitmap area because both HPT and radix
4639          * accumulate bits in the first half.
4640          */
4641         n = kvm_dirty_bitmap_bytes(memslot);
4642         buf = memslot->dirty_bitmap + n / sizeof(long);
4643         memset(buf, 0, n);
4644
4645         if (kvm_is_radix(kvm))
4646                 r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
4647         else
4648                 r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
4649         if (r)
4650                 goto out;
4651
4652         /*
4653          * We accumulate dirty bits in the first half of the
4654          * memslot's dirty_bitmap area, for when pages are paged
4655          * out or modified by the host directly.  Pick up these
4656          * bits and add them to the map.
4657          */
4658         p = memslot->dirty_bitmap;
4659         for (i = 0; i < n / sizeof(long); ++i)
4660                 buf[i] |= xchg(&p[i], 0);
4661
4662         /* Harvest dirty bits from VPA and DTL updates */
4663         /* Note: we never modify the SLB shadow buffer areas */
4664         kvm_for_each_vcpu(i, vcpu, kvm) {
4665                 spin_lock(&vcpu->arch.vpa_update_lock);
4666                 kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
4667                 kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
4668                 spin_unlock(&vcpu->arch.vpa_update_lock);
4669         }
4670
4671         r = -EFAULT;
4672         if (copy_to_user(log->dirty_bitmap, buf, n))
4673                 goto out;
4674
4675         r = 0;
4676 out:
4677         mutex_unlock(&kvm->slots_lock);
4678         return r;
4679 }
4680
4681 static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
4682 {
4683         vfree(slot->arch.rmap);
4684         slot->arch.rmap = NULL;
4685 }
4686
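/* Allocate the reverse-mapping (rmap) array when a memslot is created. */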
4687 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
4688                                         struct kvm_memory_slot *slot,
4689                                         const struct kvm_userspace_memory_region *mem,
4690                                         enum kvm_mr_change change)
4691 {
4692         unsigned long npages = mem->memory_size >> PAGE_SHIFT;
4693
4694         if (change == KVM_MR_CREATE) {
4695                 slot->arch.rmap = vzalloc(array_size(npages,
4696                                           sizeof(*slot->arch.rmap)));
4697                 if (!slot->arch.rmap)
4698                         return -ENOMEM;
4699         }
4700
4701         return 0;
4702 }
4703
4704 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
4705                                 const struct kvm_userspace_memory_region *mem,
4706                                 const struct kvm_memory_slot *old,
4707                                 const struct kvm_memory_slot *new,
4708                                 enum kvm_mr_change change)
4709 {
4710         unsigned long npages = mem->memory_size >> PAGE_SHIFT;
4711
4712         /*
4713          * If we are making a new memslot, it might make
4714          * some address that was previously cached as emulated
4715          * MMIO be no longer emulated MMIO, so invalidate
4716          * all the caches of emulated MMIO translations.
4717          */
4718         if (npages)
4719                 atomic64_inc(&kvm->arch.mmio_update);
4720
4721         /*
4722          * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
4723          * have already called kvm_arch_flush_shadow_memslot() to
4724          * flush shadow mappings.  For KVM_MR_CREATE we have no
4725          * previous mappings.  So the only case to handle is
4726          * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
4727          * has been changed.
4728          * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
4729          * to get rid of any THP PTEs in the partition-scoped page tables
4730          * so we can track dirtiness at the page level; we flush when
4731          * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
4732          * using THP PTEs.
4733          */
4734         if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
4735             ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
4736                 kvmppc_radix_flush_memslot(kvm, old);
4737         /*
4738          * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
4739          */
4740         if (!kvm->arch.secure_guest)
4741                 return;
4742
4743         switch (change) {
4744         case KVM_MR_CREATE:
4745                 /*
4746                  * @TODO kvmppc_uvmem_memslot_create() can fail and
4747                  * return error. Fix this.
4748                  */
4749                 kvmppc_uvmem_memslot_create(kvm, new);
4750                 break;
4751         case KVM_MR_DELETE:
4752                 kvmppc_uvmem_memslot_delete(kvm, old);
4753                 break;
4754         default:
4755                 /* TODO: Handle KVM_MR_MOVE */
4756                 break;
4757         }
4758 }
4759
4760 /*
4761  * Update LPCR values in kvm->arch and in vcores.
4762  * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
4763  * of kvm->arch.lpcr update).
4764  */
4765 void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
4766 {
4767         long int i;
4768         u32 cores_done = 0;
4769
4770         if ((kvm->arch.lpcr & mask) == lpcr)
4771                 return;
4772
4773         kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
4774
4775         for (i = 0; i < KVM_MAX_VCORES; ++i) {
4776                 struct kvmppc_vcore *vc = kvm->arch.vcores[i];
4777                 if (!vc)
4778                         continue;
4779
4780                 spin_lock(&vc->lock);
4781                 vc->lpcr = (vc->lpcr & ~mask) | lpcr;
4782                 verify_lpcr(kvm, vc->lpcr);
4783                 spin_unlock(&vc->lock);
4784                 if (++cores_done >= kvm->arch.online_vcores)
4785                         break;
4786         }
4787 }
4788
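/*
 * Write this guest's partition-table entry: dword 0 describes the HPT
 * or the radix tree root, dword 1 the process table supplied by
 * userspace.
 */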
4789 void kvmppc_setup_partition_table(struct kvm *kvm)
4790 {
4791         unsigned long dw0, dw1;
4792
4793         if (!kvm_is_radix(kvm)) {
4794                 /* PS field - page size for VRMA */
4795                 dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
4796                         ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
4797                 /* HTABSIZE and HTABORG fields */
4798                 dw0 |= kvm->arch.sdr1;
4799
4800                 /* Second dword as set by userspace */
4801                 dw1 = kvm->arch.process_table;
4802         } else {
4803                 dw0 = PATB_HR | radix__get_tree_size() |
4804                         __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
4805                 dw1 = PATB_GR | kvm->arch.process_table;
4806         }
4807         kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
4808 }
4809
4810 /*
4811  * Set up HPT (hashed page table) and RMA (real-mode area).
4812  * Must be called with kvm->arch.mmu_setup_lock held.
4813  */
4814 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
4815 {
4816         int err = 0;
4817         struct kvm *kvm = vcpu->kvm;
4818         unsigned long hva;
4819         struct kvm_memory_slot *memslot;
4820         struct vm_area_struct *vma;
4821         unsigned long lpcr = 0, senc;
4822         unsigned long psize, porder;
4823         int srcu_idx;
4824
4825         /* Allocate hashed page table (if not done already) and reset it */
4826         if (!kvm->arch.hpt.virt) {
4827                 int order = KVM_DEFAULT_HPT_ORDER;
4828                 struct kvm_hpt_info info;
4829
4830                 err = kvmppc_allocate_hpt(&info, order);
4831                 /* If we get here, it means userspace didn't specify a
4832                  * size explicitly.  So, try successively smaller
4833                  * sizes if the default failed. */
4834                 while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
4835                         err  = kvmppc_allocate_hpt(&info, order);
4836
4837                 if (err < 0) {
4838                         pr_err("KVM: Couldn't alloc HPT\n");
4839                         goto out;
4840                 }
4841
4842                 kvmppc_set_hpt(kvm, &info);
4843         }
4844
4845         /* Look up the memslot for guest physical address 0 */
4846         srcu_idx = srcu_read_lock(&kvm->srcu);
4847         memslot = gfn_to_memslot(kvm, 0);
4848
4849         /* We must have some memory at 0 by now */
4850         err = -EINVAL;
4851         if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
4852                 goto out_srcu;
4853
4854         /* Look up the VMA for the start of this memory slot */
4855         hva = memslot->userspace_addr;
4856         mmap_read_lock(kvm->mm);
4857         vma = find_vma(kvm->mm, hva);
4858         if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
4859                 goto up_out;
4860
4861         psize = vma_kernel_pagesize(vma);
4862
4863         mmap_read_unlock(kvm->mm);
4864
4865         /* We can handle 4k, 64k or 16M pages in the VRMA */
4866         if (psize >= 0x1000000)
4867                 psize = 0x1000000;
4868         else if (psize >= 0x10000)
4869                 psize = 0x10000;
4870         else
4871                 psize = 0x1000;
4872         porder = __ilog2(psize);
4873
4874         senc = slb_pgsize_encoding(psize);
4875         kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
4876                 (VRMA_VSID << SLB_VSID_SHIFT_1T);
4877         /* Create HPTEs in the hash page table for the VRMA */
4878         kvmppc_map_vrma(vcpu, memslot, porder);
4879
4880         /* Update VRMASD field in the LPCR */
4881         if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
4882                 /* the -4 is to account for senc values starting at 0x10 */
4883                 lpcr = senc << (LPCR_VRMASD_SH - 4);
4884                 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
4885         }
4886
4887         /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
4888         smp_wmb();
4889         err = 0;
4890  out_srcu:
4891         srcu_read_unlock(&kvm->srcu, srcu_idx);
4892  out:
4893         return err;
4894
4895  up_out:
4896         mmap_read_unlock(kvm->mm);
4897         goto out_srcu;
4898 }
4899
4900 /*
4901  * Must be called with kvm->arch.mmu_setup_lock held and
4902  * mmu_ready = 0 and no vcpus running.
4903  */
4904 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
4905 {
4906         if (nesting_enabled(kvm))
4907                 kvmhv_release_all_nested(kvm);
4908         kvmppc_rmap_reset(kvm);
4909         kvm->arch.process_table = 0;
4910         /* Mutual exclusion with kvm_unmap_gfn_range etc. */
4911         spin_lock(&kvm->mmu_lock);
4912         kvm->arch.radix = 0;
4913         spin_unlock(&kvm->mmu_lock);
4914         kvmppc_free_radix(kvm);
4915         kvmppc_update_lpcr(kvm, LPCR_VPM1,
4916                            LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
4917         return 0;
4918 }
4919
4920 /*
4921  * Must be called with kvm->arch.mmu_setup_lock held and
4922  * mmu_ready = 0 and no vcpus running.
4923  */
4924 int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
4925 {
4926         int err;
4927
4928         err = kvmppc_init_vm_radix(kvm);
4929         if (err)
4930                 return err;
4931         kvmppc_rmap_reset(kvm);
4932         /* Mutual exclusion with kvm_unmap_gfn_range etc. */
4933         spin_lock(&kvm->mmu_lock);
4934         kvm->arch.radix = 1;
4935         spin_unlock(&kvm->mmu_lock);
4936         kvmppc_free_hpt(&kvm->arch.hpt);
4937         kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
4938                            LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
4939         return 0;
4940 }
4941
4942 #ifdef CONFIG_KVM_XICS
4943 /*
4944  * Allocate a per-core structure for managing state about which cores are
4945  * running in the host versus the guest and for exchanging data between
4946  * real-mode KVM and CPUs running in the host.
4947  * This is only done for the first VM.
4948  * The allocated structure stays even if all VMs have stopped.
4949  * It is only freed when the kvm-hv module is unloaded.
4950  * It's OK for this routine to fail; we just don't support host
4951  * core operations like redirecting H_IPI wakeups.
4952  */
4953 void kvmppc_alloc_host_rm_ops(void)
4954 {
4955         struct kvmppc_host_rm_ops *ops;
4956         unsigned long l_ops;
4957         int cpu, core;
4958         int size;
4959
4960         /* Not the first time here ? */
4961         if (kvmppc_host_rm_ops_hv != NULL)
4962                 return;
4963
4964         ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
4965         if (!ops)
4966                 return;
4967
4968         size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
4969         ops->rm_core = kzalloc(size, GFP_KERNEL);
4970
4971         if (!ops->rm_core) {
4972                 kfree(ops);
4973                 return;
4974         }
4975
4976         cpus_read_lock();
4977
4978         for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
4979                 if (!cpu_online(cpu))
4980                         continue;
4981
4982                 core = cpu >> threads_shift;
4983                 ops->rm_core[core].rm_state.in_host = 1;
4984         }
4985
4986         ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
4987
4988         /*
4989          * Make the contents of the kvmppc_host_rm_ops structure visible
4990          * to other CPUs before we assign it to the global variable.
4991          * Do an atomic assignment (no locks used here), but if someone
4992          * beats us to it, just free our copy and return.
4993          */
4994         smp_wmb();
4995         l_ops = (unsigned long) ops;
4996
4997         if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
4998                 cpus_read_unlock();
4999                 kfree(ops->rm_core);
5000                 kfree(ops);
5001                 return;
5002         }
5003
5004         cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
5005                                              "ppc/kvm_book3s:prepare",
5006                                              kvmppc_set_host_core,
5007                                              kvmppc_clear_host_core);
5008         cpus_read_unlock();
5009 }
5010
5011 void kvmppc_free_host_rm_ops(void)
5012 {
5013         if (kvmppc_host_rm_ops_hv) {
5014                 cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
5015                 kfree(kvmppc_host_rm_ops_hv->rm_core);
5016                 kfree(kvmppc_host_rm_ops_hv);
5017                 kvmppc_host_rm_ops_hv = NULL;
5018         }
5019 }
5020 #endif
5021
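/*
 * Initialize the HV-specific parts of a new VM: allocate an LPID,
 * compute the initial LPCR value, set up the radix page tables if the
 * host is radix (otherwise HPT setup is deferred), and create the VM's
 * debugfs directory.
 */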
5022 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
5023 {
5024         unsigned long lpcr, lpid;
5025         char buf[32];
5026         int ret;
5027
5028         mutex_init(&kvm->arch.uvmem_lock);
5029         INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
5030         mutex_init(&kvm->arch.mmu_setup_lock);
5031
5032         /* Allocate the guest's logical partition ID */
5033
5034         lpid = kvmppc_alloc_lpid();
5035         if ((long)lpid < 0)
5036                 return -ENOMEM;
5037         kvm->arch.lpid = lpid;
5038
5039         kvmppc_alloc_host_rm_ops();
5040
5041         kvmhv_vm_nested_init(kvm);
5042
5043         /*
5044          * Since we don't flush the TLB when tearing down a VM,
5045          * and this lpid might have previously been used,
5046          * make sure we flush on each core before running the new VM.
5047          * On POWER9, the tlbie in mmu_partition_table_set_entry()
5048          * does this flush for us.
5049          */
5050         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5051                 cpumask_setall(&kvm->arch.need_tlb_flush);
5052
5053         /* Start out with the default set of hcalls enabled */
5054         memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
5055                sizeof(kvm->arch.enabled_hcalls));
5056
5057         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5058                 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
5059
5060         /* Init LPCR for virtual RMA mode */
5061         if (cpu_has_feature(CPU_FTR_HVMODE)) {
5062                 kvm->arch.host_lpid = mfspr(SPRN_LPID);
5063                 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
5064                 lpcr &= LPCR_PECE | LPCR_LPES;
5065         } else {
5066                 lpcr = 0;
5067         }
5068         lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
5069                 LPCR_VPM0 | LPCR_VPM1;
5070         kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
5071                 (VRMA_VSID << SLB_VSID_SHIFT_1T);
5072         /* On POWER8 turn on online bit to enable PURR/SPURR */
5073         if (cpu_has_feature(CPU_FTR_ARCH_207S))
5074                 lpcr |= LPCR_ONL;
5075         /*
5076          * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
5077          * Set HVICE bit to enable hypervisor virtualization interrupts.
5078          * Set HEIC to prevent OS interrupts from going to the hypervisor (should
5079          * be unnecessary but better safe than sorry in case we re-enable
5080          * EE in HV mode with this LPCR still set)
5081          */
5082         if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5083                 lpcr &= ~LPCR_VPM0;
5084                 lpcr |= LPCR_HVICE | LPCR_HEIC;
5085
5086                 /*
5087                  * If xive is enabled, we route 0x500 interrupts directly
5088                  * to the guest.
5089                  */
5090                 if (xics_on_xive())
5091                         lpcr |= LPCR_LPES;
5092         }
5093
5094         /*
5095          * If the host uses radix, the guest starts out as radix.
5096          */
5097         if (radix_enabled()) {
5098                 kvm->arch.radix = 1;
5099                 kvm->arch.mmu_ready = 1;
5100                 lpcr &= ~LPCR_VPM1;
5101                 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5102                 ret = kvmppc_init_vm_radix(kvm);
5103                 if (ret) {
5104                         kvmppc_free_lpid(kvm->arch.lpid);
5105                         return ret;
5106                 }
5107                 kvmppc_setup_partition_table(kvm);
5108         }
5109
5110         verify_lpcr(kvm, lpcr);
5111         kvm->arch.lpcr = lpcr;
5112
5113         /* Initialization for future HPT resizes */
5114         kvm->arch.resize_hpt = NULL;
5115
5116         /*
5117          * Work out how many sets the TLB has, for the use of
5118          * the TLB invalidation loop in book3s_hv_rmhandlers.S.
5119          */
5120         if (cpu_has_feature(CPU_FTR_ARCH_31)) {
5121                 /*
5122                  * P10 will flush all the congruence classes with a single tlbiel
5123                  */
5124                 kvm->arch.tlb_sets = 1;
5125         } else if (radix_enabled())
5126                 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;     /* 128 */
5127         else if (cpu_has_feature(CPU_FTR_ARCH_300))
5128                 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;      /* 256 */
5129         else if (cpu_has_feature(CPU_FTR_ARCH_207S))
5130                 kvm->arch.tlb_sets = POWER8_TLB_SETS;           /* 512 */
5131         else
5132                 kvm->arch.tlb_sets = POWER7_TLB_SETS;           /* 128 */
5133
5134         /*
5135          * Track that we now have a HV mode VM active. This blocks secondary
5136          * CPU threads from coming online.
5137          */
5138         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5139                 kvm_hv_vm_activated();
5140
5141         /*
5142          * Initialize smt_mode depending on processor.
5143          * POWER8 and earlier have to use "strict" threading, where
5144          * all vCPUs in a vcore have to run on the same (sub)core,
5145          * whereas on POWER9 the threads can each run a different
5146          * guest.
5147          */
5148         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5149                 kvm->arch.smt_mode = threads_per_subcore;
5150         else
5151                 kvm->arch.smt_mode = 1;
5152         kvm->arch.emul_smt_mode = 1;
5153
5154         /*
5155          * Create a debugfs directory for the VM
5156          */
5157         snprintf(buf, sizeof(buf), "vm%d", current->pid);
5158         kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
5159         kvmppc_mmu_debugfs_init(kvm);
5160         if (radix_enabled())
5161                 kvmhv_radix_debugfs_init(kvm);
5162
5163         return 0;
5164 }
5165
5166 static void kvmppc_free_vcores(struct kvm *kvm)
5167 {
5168         long int i;
5169
5170         for (i = 0; i < KVM_MAX_VCORES; ++i)
5171                 kfree(kvm->arch.vcores[i]);
5172         kvm->arch.online_vcores = 0;
5173 }
5174
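/*
 * Tear down the HV-specific parts of a VM: free the vcores and the HPT
 * or radix tables, release any nested-guest state, and return the LPID
 * to the allocator.
 */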
5175 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
5176 {
5177         debugfs_remove_recursive(kvm->arch.debugfs_dir);
5178
5179         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5180                 kvm_hv_vm_deactivated();
5181
5182         kvmppc_free_vcores(kvm);
5183
5184
5185         if (kvm_is_radix(kvm))
5186                 kvmppc_free_radix(kvm);
5187         else
5188                 kvmppc_free_hpt(&kvm->arch.hpt);
5189
5190         /* Perform global invalidation and return lpid to the pool */
5191         if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5192                 if (nesting_enabled(kvm))
5193                         kvmhv_release_all_nested(kvm);
5194                 kvm->arch.process_table = 0;
5195                 if (kvm->arch.secure_guest)
5196                         uv_svm_terminate(kvm->arch.lpid);
5197                 kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
5198         }
5199
5200         kvmppc_free_lpid(kvm->arch.lpid);
5201
5202         kvmppc_free_pimap(kvm);
5203 }
5204
5205 /* We don't need to emulate any privileged instructions or dcbz */
5206 static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
5207                                      unsigned int inst, int *advance)
5208 {
5209         return EMULATE_FAIL;
5210 }
5211
5212 static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
5213                                         ulong spr_val)
5214 {
5215         return EMULATE_FAIL;
5216 }
5217
5218 static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
5219                                         ulong *spr_val)
5220 {
5221         return EMULATE_FAIL;
5222 }
5223
5224 static int kvmppc_core_check_processor_compat_hv(void)
5225 {
5226         if (cpu_has_feature(CPU_FTR_HVMODE) &&
5227             cpu_has_feature(CPU_FTR_ARCH_206))
5228                 return 0;
5229
5230         /* POWER9 in radix mode is capable of being a nested hypervisor. */
5231         if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
5232                 return 0;
5233
5234         return -EIO;
5235 }
5236
5237 #ifdef CONFIG_KVM_XICS
5238
5239 void kvmppc_free_pimap(struct kvm *kvm)
5240 {
5241         kfree(kvm->arch.pimap);
5242 }
5243
5244 static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
5245 {
5246         return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
5247 }
5248
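/*
 * Map a host IRQ to a guest interrupt source (guest_gsi) so that the
 * XICS/XIVE code can deliver and EOI the interrupt on the guest's
 * behalf.  Called when an irqfd producer is bound to this VM.
 */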
5249 static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
5250 {
5251         struct irq_desc *desc;
5252         struct kvmppc_irq_map *irq_map;
5253         struct kvmppc_passthru_irqmap *pimap;
5254         struct irq_chip *chip;
5255         int i, rc = 0;
5256
5257         if (!kvm_irq_bypass)
5258                 return 1;
5259
5260         desc = irq_to_desc(host_irq);
5261         if (!desc)
5262                 return -EIO;
5263
5264         mutex_lock(&kvm->lock);
5265
5266         pimap = kvm->arch.pimap;
5267         if (pimap == NULL) {
5268                 /* First call, allocate structure to hold IRQ map */
5269                 pimap = kvmppc_alloc_pimap();
5270                 if (pimap == NULL) {
5271                         mutex_unlock(&kvm->lock);
5272                         return -ENOMEM;
5273                 }
5274                 kvm->arch.pimap = pimap;
5275         }
5276
5277         /*
5278          * For now, we only support interrupts for which the EOI operation
5279          * is an OPAL call followed by a write to XIRR (since that's
5280          * what our real-mode EOI code does), or a XIVE interrupt
5281          */
5282         chip = irq_data_get_irq_chip(&desc->irq_data);
5283         if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
5284                 pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
5285                         host_irq, guest_gsi);
5286                 mutex_unlock(&kvm->lock);
5287                 return -ENOENT;
5288         }
5289
5290         /*
5291          * See if we already have an entry for this guest IRQ number.
5292          * If it's mapped to a hardware IRQ number, that's an error,
5293          * If it's mapped to a hardware IRQ number, that's an error;
5294          */
5295         for (i = 0; i < pimap->n_mapped; i++) {
5296                 if (guest_gsi == pimap->mapped[i].v_hwirq) {
5297                         if (pimap->mapped[i].r_hwirq) {
5298                                 mutex_unlock(&kvm->lock);
5299                                 return -EINVAL;
5300                         }
5301                         break;
5302                 }
5303         }
5304
5305         if (i == KVMPPC_PIRQ_MAPPED) {
5306                 mutex_unlock(&kvm->lock);
5307                 return -EAGAIN;         /* table is full */
5308         }
5309
5310         irq_map = &pimap->mapped[i];
5311
5312         irq_map->v_hwirq = guest_gsi;
5313         irq_map->desc = desc;
5314
5315         /*
5316          * Order the above two stores before the next to serialize with
5317          * the KVM real mode handler.
5318          */
5319         smp_wmb();
5320         irq_map->r_hwirq = desc->irq_data.hwirq;
5321
5322         if (i == pimap->n_mapped)
5323                 pimap->n_mapped++;
5324
5325         if (xics_on_xive())
5326                 rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
5327         else
5328                 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
5329         if (rc)
5330                 irq_map->r_hwirq = 0;
5331
5332         mutex_unlock(&kvm->lock);
5333
5334         return 0;
5335 }
5336
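/*
 * Undo kvmppc_set_passthru_irq(): detach the guest interrupt source
 * from the host IRQ.
 */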
5337 static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
5338 {
5339         struct irq_desc *desc;
5340         struct kvmppc_passthru_irqmap *pimap;
5341         int i, rc = 0;
5342
5343         if (!kvm_irq_bypass)
5344                 return 0;
5345
5346         desc = irq_to_desc(host_irq);
5347         if (!desc)
5348                 return -EIO;
5349
5350         mutex_lock(&kvm->lock);
5351         if (!kvm->arch.pimap)
5352                 goto unlock;
5353
5354         pimap = kvm->arch.pimap;
5355
5356         for (i = 0; i < pimap->n_mapped; i++) {
5357                 if (guest_gsi == pimap->mapped[i].v_hwirq)
5358                         break;
5359         }
5360
5361         if (i == pimap->n_mapped) {
5362                 mutex_unlock(&kvm->lock);
5363                 return -ENODEV;
5364         }
5365
5366         if (xics_on_xive())
5367                 rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
5368         else
5369                 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
5370
5371         /* invalidate the entry (what should we do on error from the above?) */
5372         pimap->mapped[i].r_hwirq = 0;
5373
5374         /*
5375          * We don't free this structure even when the count goes to
5376          * zero. The structure is freed when we destroy the VM.
5377          */
5378  unlock:
5379         mutex_unlock(&kvm->lock);
5380         return rc;
5381 }
5382
5383 static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
5384                                              struct irq_bypass_producer *prod)
5385 {
5386         int ret = 0;
5387         struct kvm_kernel_irqfd *irqfd =
5388                 container_of(cons, struct kvm_kernel_irqfd, consumer);
5389
5390         irqfd->producer = prod;
5391
5392         ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
5393         if (ret)
5394                 pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
5395                         prod->irq, irqfd->gsi, ret);
5396
5397         return ret;
5398 }
5399
5400 static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
5401                                               struct irq_bypass_producer *prod)
5402 {
5403         int ret;
5404         struct kvm_kernel_irqfd *irqfd =
5405                 container_of(cons, struct kvm_kernel_irqfd, consumer);
5406
5407         irqfd->producer = NULL;
5408
5409         /*
5410          * When the producer of a consumer is unregistered, we change back to
5411          * the default external interrupt handling mode - KVM real mode
5412          * will switch back to the host.
5413          */
5414         ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
5415         if (ret)
5416                 pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
5417                         prod->irq, irqfd->gsi, ret);
5418 }
5419 #endif
5420
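/*
 * HV-specific VM ioctls: HPT allocation, the HTAB fd, and HPT resize
 * prepare/commit.
 */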
5421 static long kvm_arch_vm_ioctl_hv(struct file *filp,
5422                                  unsigned int ioctl, unsigned long arg)
5423 {
5424         struct kvm *kvm __maybe_unused = filp->private_data;
5425         void __user *argp = (void __user *)arg;
5426         long r;
5427
5428         switch (ioctl) {
5429
5430         case KVM_PPC_ALLOCATE_HTAB: {
5431                 u32 htab_order;
5432
5433                 /* If we're a nested hypervisor, we currently only support radix */
5434                 if (kvmhv_on_pseries()) {
5435                         r = -EOPNOTSUPP;
5436                         break;
5437                 }
5438
5439                 r = -EFAULT;
5440                 if (get_user(htab_order, (u32 __user *)argp))
5441                         break;
5442                 r = kvmppc_alloc_reset_hpt(kvm, htab_order);
5443                 if (r)
5444                         break;
5445                 r = 0;
5446                 break;
5447         }
5448
5449         case KVM_PPC_GET_HTAB_FD: {
5450                 struct kvm_get_htab_fd ghf;
5451
5452                 r = -EFAULT;
5453                 if (copy_from_user(&ghf, argp, sizeof(ghf)))
5454                         break;
5455                 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
5456                 break;
5457         }
5458
5459         case KVM_PPC_RESIZE_HPT_PREPARE: {
5460                 struct kvm_ppc_resize_hpt rhpt;
5461
5462                 r = -EFAULT;
5463                 if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
5464                         break;
5465
5466                 r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
5467                 break;
5468         }
5469
5470         case KVM_PPC_RESIZE_HPT_COMMIT: {
5471                 struct kvm_ppc_resize_hpt rhpt;
5472
5473                 r = -EFAULT;
5474                 if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
5475                         break;
5476
5477                 r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
5478                 break;
5479         }
5480
5481         default:
5482                 r = -ENOTTY;
5483         }
5484
5485         return r;
5486 }
5487
5488 /*
5489  * List of hcall numbers to enable by default.
5490  * For compatibility with old userspace, we enable by default
5491  * all hcalls that were implemented before the hcall-enabling
5492  * facility was added.  Note this list should not include H_RTAS.
5493  */
5494 static unsigned int default_hcall_list[] = {
5495         H_REMOVE,
5496         H_ENTER,
5497         H_READ,
5498         H_PROTECT,
5499         H_BULK_REMOVE,
5500 #ifdef CONFIG_SPAPR_TCE_IOMMU
5501         H_GET_TCE,
5502         H_PUT_TCE,
5503 #endif
5504         H_SET_DABR,
5505         H_SET_XDABR,
5506         H_CEDE,
5507         H_PROD,
5508         H_CONFER,
5509         H_REGISTER_VPA,
5510 #ifdef CONFIG_KVM_XICS
5511         H_EOI,
5512         H_CPPR,
5513         H_IPI,
5514         H_IPOLL,
5515         H_XIRR,
5516         H_XIRR_X,
5517 #endif
5518         0
5519 };
5520
5521 static void init_default_hcalls(void)
5522 {
5523         int i;
5524         unsigned int hcall;
5525
5526         for (i = 0; default_hcall_list[i]; ++i) {
5527                 hcall = default_hcall_list[i];
5528                 WARN_ON(!kvmppc_hcall_impl_hv(hcall));
5529                 __set_bit(hcall / 4, default_enabled_hcalls);
5530         }
5531 }
5532
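/*
 * Handler behind the KVM_PPC_CONFIGURE_V3_MMU ioctl: switch the guest
 * between HPT and radix translation and record its process table.
 */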
5533 static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
5534 {
5535         unsigned long lpcr;
5536         int radix;
5537         int err;
5538
5539         /* If not on a POWER9, reject it */
5540         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5541                 return -ENODEV;
5542
5543         /* If any unknown flags set, reject it */
5544         if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
5545                 return -EINVAL;
5546
5547         /* GR (guest radix) bit in process_table field must match */
5548         radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
5549         if (!!(cfg->process_table & PATB_GR) != radix)
5550                 return -EINVAL;
5551
5552         /* Process table size field must be reasonable, i.e. <= 24 */
5553         if ((cfg->process_table & PRTS_MASK) > 24)
5554                 return -EINVAL;
5555
5556         /* We can change a guest to/from radix now, if the host is radix */
5557         if (radix && !radix_enabled())
5558                 return -EINVAL;
5559
5560         /* If we're a nested hypervisor, we currently only support radix */
5561         if (kvmhv_on_pseries() && !radix)
5562                 return -EINVAL;
5563
5564         mutex_lock(&kvm->arch.mmu_setup_lock);
5565         if (radix != kvm_is_radix(kvm)) {
5566                 if (kvm->arch.mmu_ready) {
5567                         kvm->arch.mmu_ready = 0;
5568                         /* order mmu_ready vs. vcpus_running */
5569                         smp_mb();
5570                         if (atomic_read(&kvm->arch.vcpus_running)) {
5571                                 kvm->arch.mmu_ready = 1;
5572                                 err = -EBUSY;
5573                                 goto out_unlock;
5574                         }
5575                 }
5576                 if (radix)
5577                         err = kvmppc_switch_mmu_to_radix(kvm);
5578                 else
5579                         err = kvmppc_switch_mmu_to_hpt(kvm);
5580                 if (err)
5581                         goto out_unlock;
5582         }
5583
5584         kvm->arch.process_table = cfg->process_table;
5585         kvmppc_setup_partition_table(kvm);
5586
5587         lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
5588         kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
5589         err = 0;
5590
5591  out_unlock:
5592         mutex_unlock(&kvm->arch.mmu_setup_lock);
5593         return err;
5594 }
5595
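/*
 * Enable (or, with kvm == NULL, test for) KVM_CAP_PPC_NESTED_HV.
 * Requires the "nested" module parameter, POWER9 and a radix host.
 */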
5596 static int kvmhv_enable_nested(struct kvm *kvm)
5597 {
5598         if (!nested)
5599                 return -EPERM;
5600         if (!cpu_has_feature(CPU_FTR_ARCH_300))
5601                 return -ENODEV;
5602         if (!radix_enabled())
5603                 return -ENODEV;
5604
5605         /* kvm == NULL means the caller is testing if the capability exists */
5606         if (kvm)
5607                 kvm->arch.nested_enable = true;
5608         return 0;
5609 }
5610
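/*
 * Copy from guest memory at an effective address by walking the radix
 * page tables.  Only radix guests are handled here; for a nested guest
 * the caller must fall back to quadrant access (-EAGAIN).
 */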
5611 static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
5612                                  int size)
5613 {
5614         int rc = -EINVAL;
5615
5616         if (kvmhv_vcpu_is_radix(vcpu)) {
5617                 rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
5618
5619                 if (rc > 0)
5620                         rc = -EINVAL;
5621         }
5622
5623         /* For now quadrants are the only way to access nested guest memory */
5624         if (rc && vcpu->arch.nested)
5625                 rc = -EAGAIN;
5626
5627         return rc;
5628 }
5629
5630 static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
5631                                 int size)
5632 {
5633         int rc = -EINVAL;
5634
5635         if (kvmhv_vcpu_is_radix(vcpu)) {
5636                 rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
5637
5638                 if (rc > 0)
5639                         rc = -EINVAL;
5640         }
5641
5642         /* For now quadrants are the only way to access nested guest memory */
5643         if (rc && vcpu->arch.nested)
5644                 rc = -EAGAIN;
5645
5646         return rc;
5647 }
5648
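/* Unpin a VPA/DTL/SLB-shadow area and clear its registration state. */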
5649 static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
5650 {
5651         unpin_vpa(kvm, vpa);
5652         vpa->gpa = 0;
5653         vpa->pinned_addr = NULL;
5654         vpa->dirty = false;
5655         vpa->update_pending = 0;
5656 }
5657
5658 /*
5659  * Enable a guest to become a secure VM, or test whether
5660  * that could be enabled.
5661  * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
5662  * tested (kvm == NULL) or enabled (kvm != NULL).
5663  */
5664 static int kvmhv_enable_svm(struct kvm *kvm)
5665 {
5666         if (!kvmppc_uvmem_available())
5667                 return -EINVAL;
5668         if (kvm)
5669                 kvm->arch.svm_enabled = 1;
5670         return 0;
5671 }
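/*
 * Minimal userspace sketch (vm_fd assumed) matching the comment above:
 * KVM_CHECK_EXTENSION is what produces the kvm == NULL call, and a VMM that
 * wants to allow the guest to go secure enables the capability per VM with
 * KVM_ENABLE_CAP before the guest starts its transition.
 *
 *	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_SECURE_GUEST };
 *
 *	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
 *		err(1, "KVM_ENABLE_CAP(KVM_CAP_PPC_SECURE_GUEST)");
 */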
5672
5673 /*
5674  * IOCTL handler to turn off secure mode of a guest:
5675  *
5676  * - Release all device pages
5677  * - Issue a ucall to terminate the guest on the UV side
5678  * - Unpin the VPA pages
5679  * - Reinitialize the partition-scoped page tables
5680  */
5681 static int kvmhv_svm_off(struct kvm *kvm)
5682 {
5683         struct kvm_vcpu *vcpu;
5684         int mmu_was_ready;
5685         int srcu_idx;
5686         int ret = 0;
5687         int i;
5688
5689         if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
5690                 return ret;
5691
5692         mutex_lock(&kvm->arch.mmu_setup_lock);
5693         mmu_was_ready = kvm->arch.mmu_ready;
5694         if (kvm->arch.mmu_ready) {
5695                 kvm->arch.mmu_ready = 0;
5696                 /* order mmu_ready vs. vcpus_running; pairs with kvmppc_vcpu_run_hv() */
5697                 smp_mb();
5698                 if (atomic_read(&kvm->arch.vcpus_running)) {
5699                         kvm->arch.mmu_ready = 1;
5700                         ret = -EBUSY;
5701                         goto out;
5702                 }
5703         }
5704
5705         srcu_idx = srcu_read_lock(&kvm->srcu);
5706         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5707                 struct kvm_memory_slot *memslot;
5708                 struct kvm_memslots *slots = __kvm_memslots(kvm, i);
5709
5710                 if (!slots)
5711                         continue;
5712
5713                 kvm_for_each_memslot(memslot, slots) {
5714                         kvmppc_uvmem_drop_pages(memslot, kvm, true);
5715                         uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
5716                 }
5717         }
5718         srcu_read_unlock(&kvm->srcu, srcu_idx);
5719
5720         ret = uv_svm_terminate(kvm->arch.lpid);
5721         if (ret != U_SUCCESS) {
5722                 ret = -EINVAL;
5723                 goto out;
5724         }
5725
5726         /*
5727          * When a secure guest is reset, all the guest pages are sent
5728          * to UV via UV_PAGE_IN before the non-boot vcpus get a
5729          * chance to run and unpin their VPA pages. Unpinning of all
5730          * VPA pages is done here explicitly so that VPA pages
5731          * can be migrated to the secure side.
5732          *
5733          * This is required for the secure SMP guest to reboot
5734          * correctly.
5735          */
5736         kvm_for_each_vcpu(i, vcpu, kvm) {
5737                 spin_lock(&vcpu->arch.vpa_update_lock);
5738                 unpin_vpa_reset(kvm, &vcpu->arch.dtl);
5739                 unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
5740                 unpin_vpa_reset(kvm, &vcpu->arch.vpa);
5741                 spin_unlock(&vcpu->arch.vpa_update_lock);
5742         }
5743
5744         kvmppc_setup_partition_table(kvm);
5745         kvm->arch.secure_guest = 0;
5746         kvm->arch.mmu_ready = mmu_was_ready;
5747 out:
5748         mutex_unlock(&kvm->arch.mmu_setup_lock);
5749         return ret;
5750 }
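/*
 * Minimal userspace sketch (vm_fd assumed): when resetting a guest that has
 * started or completed the switch to secure mode, the VMM issues the
 * argument-less KVM_PPC_SVM_OFF vm ioctl, which runs kvmhv_svm_off() above,
 * before it repopulates guest memory.
 *
 *	if (ioctl(vm_fd, KVM_PPC_SVM_OFF) < 0)
 *		err(1, "KVM_PPC_SVM_OFF");
 */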
5751
5752 static int kvmhv_enable_dawr1(struct kvm *kvm)
5753 {
5754         if (!cpu_has_feature(CPU_FTR_DAWR1))
5755                 return -ENODEV;
5756
5757         /* kvm == NULL means the caller is testing if the capability exists */
5758         if (kvm)
5759                 kvm->arch.dawr1_enabled = true;
5760         return 0;
5761 }
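/*
 * Illustrative sketch (vm_fd assumed): userspace that wants to expose the
 * second watchpoint to the guest enables KVM_CAP_PPC_DAWR1 per VM, again via
 * KVM_ENABLE_CAP; checking the extension with kvm == NULL only reports
 * whether the host has CPU_FTR_DAWR1.
 *
 *	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_DAWR1 };
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */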
5762
5763 static bool kvmppc_hash_v3_possible(void)
5764 {
5765         if (radix_enabled() && no_mixing_hpt_and_radix)
5766                 return false;
5767
5768         return cpu_has_feature(CPU_FTR_ARCH_300) &&
5769                 cpu_has_feature(CPU_FTR_HVMODE);
5770 }
5771
5772 static struct kvmppc_ops kvm_ops_hv = {
5773         .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
5774         .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
5775         .get_one_reg = kvmppc_get_one_reg_hv,
5776         .set_one_reg = kvmppc_set_one_reg_hv,
5777         .vcpu_load   = kvmppc_core_vcpu_load_hv,
5778         .vcpu_put    = kvmppc_core_vcpu_put_hv,
5779         .inject_interrupt = kvmppc_inject_interrupt_hv,
5780         .set_msr     = kvmppc_set_msr_hv,
5781         .vcpu_run    = kvmppc_vcpu_run_hv,
5782         .vcpu_create = kvmppc_core_vcpu_create_hv,
5783         .vcpu_free   = kvmppc_core_vcpu_free_hv,
5784         .check_requests = kvmppc_core_check_requests_hv,
5785         .get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
5786         .flush_memslot  = kvmppc_core_flush_memslot_hv,
5787         .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
5788         .commit_memory_region  = kvmppc_core_commit_memory_region_hv,
5789         .unmap_gfn_range = kvm_unmap_gfn_range_hv,
5790         .age_gfn = kvm_age_gfn_hv,
5791         .test_age_gfn = kvm_test_age_gfn_hv,
5792         .set_spte_gfn = kvm_set_spte_gfn_hv,
5793         .free_memslot = kvmppc_core_free_memslot_hv,
5794         .init_vm =  kvmppc_core_init_vm_hv,
5795         .destroy_vm = kvmppc_core_destroy_vm_hv,
5796         .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
5797         .emulate_op = kvmppc_core_emulate_op_hv,
5798         .emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
5799         .emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
5800         .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
5801         .arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
5802         .hcall_implemented = kvmppc_hcall_impl_hv,
5803 #ifdef CONFIG_KVM_XICS
5804         .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
5805         .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
5806 #endif
5807         .configure_mmu = kvmhv_configure_mmu,
5808         .get_rmmu_info = kvmhv_get_rmmu_info,
5809         .set_smt_mode = kvmhv_set_smt_mode,
5810         .enable_nested = kvmhv_enable_nested,
5811         .load_from_eaddr = kvmhv_load_from_eaddr,
5812         .store_to_eaddr = kvmhv_store_to_eaddr,
5813         .enable_svm = kvmhv_enable_svm,
5814         .svm_off = kvmhv_svm_off,
5815         .enable_dawr1 = kvmhv_enable_dawr1,
5816         .hash_v3_possible = kvmppc_hash_v3_possible,
5817 };
5818
5819 static int kvm_init_subcore_bitmap(void)
5820 {
5821         int i, j;
5822         int nr_cores = cpu_nr_cores();
5823         struct sibling_subcore_state *sibling_subcore_state;
5824
5825         for (i = 0; i < nr_cores; i++) {
5826                 int first_cpu = i * threads_per_core;
5827                 int node = cpu_to_node(first_cpu);
5828
5829                 /* Ignore if it is already allocated. */
5830                 if (paca_ptrs[first_cpu]->sibling_subcore_state)
5831                         continue;
5832
5833                 sibling_subcore_state =
5834                         kzalloc_node(sizeof(struct sibling_subcore_state),
5835                                                         GFP_KERNEL, node);
5836                 if (!sibling_subcore_state)
5837                         return -ENOMEM;
5838
5840                 for (j = 0; j < threads_per_core; j++) {
5841                         int cpu = first_cpu + j;
5842
5843                         paca_ptrs[cpu]->sibling_subcore_state =
5844                                                 sibling_subcore_state;
5845                 }
5846         }
5847         return 0;
5848 }
5849
5850 static int kvmppc_radix_possible(void)
5851 {
5852         return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
5853 }
5854
5855 static int kvmppc_book3s_init_hv(void)
5856 {
5857         int r;
5858
5859         if (!tlbie_capable) {
5860                 pr_err("KVM-HV: Host does not support TLBIE\n");
5861                 return -ENODEV;
5862         }
5863
5864         /*
5865          * FIXME!! Do we need to check on all CPUs?
5866          */
5867         r = kvmppc_core_check_processor_compat_hv();
5868         if (r < 0)
5869                 return -ENODEV;
5870
5871         r = kvmhv_nested_init();
5872         if (r)
5873                 return r;
5874
5875         r = kvm_init_subcore_bitmap();
5876         if (r)
5877                 return r;
5878
5879         /*
5880          * We need a way of accessing the XICS interrupt controller,
5881          * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
5882          * indirectly, via OPAL.
5883          */
5884 #ifdef CONFIG_SMP
5885         if (!xics_on_xive() && !kvmhv_on_pseries() &&
5886             !local_paca->kvm_hstate.xics_phys) {
5887                 struct device_node *np;
5888
5889                 np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
5890                 if (!np) {
5891                         pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
5892                         return -ENODEV;
5893                 }
5894                 /* presence of intc confirmed - node can be dropped again */
5895                 of_node_put(np);
5896         }
5897 #endif
5898
5899         kvm_ops_hv.owner = THIS_MODULE;
5900         kvmppc_hv_ops = &kvm_ops_hv;
5901
5902         init_default_hcalls();
5903
5904         init_vcore_lists();
5905
5906         r = kvmppc_mmu_hv_init();
5907         if (r)
5908                 return r;
5909
5910         if (kvmppc_radix_possible())
5911                 r = kvmppc_radix_init();
5912
5913         /*
5914          * POWER9 chips before version 2.02 can't have some threads in
5915          * HPT mode and some in radix mode on the same core.
5916          */
5917         if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5918                 unsigned int pvr = mfspr(SPRN_PVR);
5919                 if ((pvr >> 16) == PVR_POWER9 &&
5920                     (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
5921                      ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
5922                         no_mixing_hpt_and_radix = true;
5923         }
5924
5925         r = kvmppc_uvmem_init();
5926         if (r < 0)
5927                 pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
5928
5929         return r;
5930 }
5931
5932 static void kvmppc_book3s_exit_hv(void)
5933 {
5934         kvmppc_uvmem_free();
5935         kvmppc_free_host_rm_ops();
5936         if (kvmppc_radix_possible())
5937                 kvmppc_radix_exit();
5938         kvmppc_hv_ops = NULL;
5939         kvmhv_nested_exit();
5940 }
5941
5942 module_init(kvmppc_book3s_init_hv);
5943 module_exit(kvmppc_book3s_exit_hv);
5944 MODULE_LICENSE("GPL");
5945 MODULE_ALIAS_MISCDEV(KVM_MINOR);
5946 MODULE_ALIAS("devname:kvm");