// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2019, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/irqdomain.h>
#include <asm/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"

static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
{
        u64 val;

        /*
         * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
         * load operation, so there is no need to enforce load-after-store
         * ordering.
         */

        val = in_be64(xd->eoi_mmio + offset);
        return (u8)val;
}

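/*
 * Disable the EQ at the OPAL level and drop the reference taken on
 * the guest queue page when the queue was configured.
 */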
static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        struct xive_q *q = &xc->queues[prio];

        xive_native_disable_queue(xc->vp_id, q, prio);
        if (q->qpage) {
                put_page(virt_to_page(q->qpage));
                q->qpage = NULL;
        }
}

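/*
 * Configure the EQ through OPAL. On success, release the previous
 * queue page, if any, so that reconfiguring a queue does not leak
 * the page that was pinned before.
 */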
static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
                                              u8 prio, __be32 *qpage,
                                              u32 order, bool can_escalate)
{
        int rc;
        __be32 *qpage_prev = q->qpage;

        rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
                                         can_escalate);
        if (rc)
                return rc;

        if (qpage_prev)
                put_page(virt_to_page(qpage_prev));

        return rc;
}

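/*
 * Tear down the XIVE state of a vCPU: mask its interrupts, free the
 * escalation interrupts and the queues, and disable the VP at the
 * OPAL level.
 */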
void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        int i;

        if (!kvmppc_xive_enabled(vcpu))
                return;

        if (!xc)
                return;

        pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);

        /* Ensure no interrupt is still routed to that VP */
        xc->valid = false;
        kvmppc_xive_disable_vcpu_interrupts(vcpu);

        /* Free escalations */
        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
                /* Free the escalation irq */
                if (xc->esc_virq[i]) {
                        if (kvmppc_xive_has_single_escalation(xc->xive))
                                xive_cleanup_single_escalation(vcpu, xc,
                                                        xc->esc_virq[i]);
                        free_irq(xc->esc_virq[i], vcpu);
                        irq_dispose_mapping(xc->esc_virq[i]);
                        kfree(xc->esc_virq_names[i]);
                        xc->esc_virq[i] = 0;
                }
        }

        /* Disable the VP */
        xive_native_disable_vp(xc->vp_id);

        /* Clear the cam word so guest entry won't try to push context */
        vcpu->arch.xive_cam_word = 0;

        /* Free the queues */
        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
                kvmppc_xive_native_cleanup_queue(vcpu, i);
        }

        /* Free the VP */
        kfree(xc);

        /* Cleanup the vcpu */
        vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
        vcpu->arch.xive_vcpu = NULL;
}

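/*
 * Connect a vCPU to the XIVE native device: compute and check its VP
 * id, allocate the per-vCPU state and enable the VP at the OPAL
 * level. This is presumably reached from the KVM_CAP_PPC_IRQ_XIVE
 * enablement path.
 */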
int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
                                    struct kvm_vcpu *vcpu, u32 server_num)
{
        struct kvmppc_xive *xive = dev->private;
        struct kvmppc_xive_vcpu *xc = NULL;
        int rc;
        u32 vp_id;

        pr_devel("native_connect_vcpu(server=%d)\n", server_num);

        if (dev->ops != &kvm_xive_native_ops) {
                pr_devel("Wrong ops !\n");
                return -EPERM;
        }
        if (xive->kvm != vcpu->kvm)
                return -EPERM;
        if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
                return -EBUSY;

        mutex_lock(&xive->lock);

        rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
        if (rc)
                goto bail;

        xc = kzalloc(sizeof(*xc), GFP_KERNEL);
        if (!xc) {
                rc = -ENOMEM;
                goto bail;
        }

        vcpu->arch.xive_vcpu = xc;
        xc->xive = xive;
        xc->vcpu = vcpu;
        xc->server_num = server_num;

        xc->vp_id = vp_id;
        xc->valid = true;
        vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;

        rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
        if (rc) {
                pr_err("Failed to get VP info from OPAL: %d\n", rc);
                goto bail;
        }

        if (!kvmppc_xive_check_save_restore(vcpu)) {
                pr_err("inconsistent save-restore setup for VCPU %d\n", server_num);
                rc = -EIO;
                goto bail;
        }

        /*
         * Enable the VP first as the single escalation mode will
         * affect escalation interrupts numbering
         */
        rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
        if (rc) {
                pr_err("Failed to enable VP in OPAL: %d\n", rc);
                goto bail;
        }

        /* Configure VCPU fields for use by assembly push/pull */
        vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
        vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

        /* TODO: reset all queues to a clean state ? */
bail:
        mutex_unlock(&xive->lock);
        if (rc)
                kvmppc_xive_native_cleanup_vcpu(vcpu);

        return rc;
}

/*
 * Device passthrough support
 */
static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
{
        struct kvmppc_xive *xive = kvm->arch.xive;
        pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

        if (irq >= KVMPPC_XIVE_NR_IRQS)
                return -EINVAL;

        /*
         * Clear the ESB pages of the IRQ number being mapped (or
         * unmapped) into the guest and let the VM fault handler
         * repopulate with the appropriate ESB pages (device or IC)
         */
        pr_debug("clearing esb pages for girq 0x%lx\n", irq);
        mutex_lock(&xive->mapping_lock);
        if (xive->mapping)
                unmap_mapping_range(xive->mapping,
                                    esb_pgoff << PAGE_SHIFT,
                                    2ull << PAGE_SHIFT, 1);
        mutex_unlock(&xive->mapping_lock);
        return 0;
}

static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
        .reset_mapped = kvmppc_xive_native_reset_mapped,
};

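/*
 * Fault handler for the ESB pages mapped at KVM_XIVE_ESB_PAGE_OFFSET
 * by kvmppc_xive_native_mmap(). Each source owns a pair of pages in
 * the mapping: the even page triggers the interrupt, the odd page is
 * used for EOI and management.
 */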
static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct kvm_device *dev = vma->vm_file->private_data;
        struct kvmppc_xive *xive = dev->private;
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        struct xive_irq_data *xd;
        u32 hw_num;
        u16 src;
        u64 page;
        unsigned long irq;
        u64 page_offset;

        /*
         * Linux/KVM uses a two-page ESB setting, one for trigger and
         * one for EOI
         */
        page_offset = vmf->pgoff - vma->vm_pgoff;
        irq = page_offset / 2;

        sb = kvmppc_xive_find_source(xive, irq, &src);
        if (!sb) {
                pr_devel("%s: source %lx not found !\n", __func__, irq);
                return VM_FAULT_SIGBUS;
        }

        state = &sb->irq_state[src];

        /* Some sanity checking */
        if (!state->valid) {
                pr_devel("%s: source %lx invalid !\n", __func__, irq);
                return VM_FAULT_SIGBUS;
        }

        kvmppc_xive_select_irq(state, &hw_num, &xd);

        arch_spin_lock(&sb->lock);

        /*
         * first/even page is for trigger
         * second/odd page is for EOI and management.
         */
        page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
        arch_spin_unlock(&sb->lock);

        if (WARN_ON(!page)) {
                pr_err("%s: accessing invalid ESB page for source %lx !\n",
                       __func__, irq);
                return VM_FAULT_SIGBUS;
        }

        vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
        return VM_FAULT_NOPAGE;
}

static const struct vm_operations_struct xive_native_esb_vmops = {
        .fault = xive_native_esb_fault,
};

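/*
 * Fault handler for the thread interrupt management area (TIMA)
 * mapped at KVM_XIVE_TIMA_PAGE_OFFSET. The TIMA covers four pages,
 * one per ring (HW, HV, OS, USER); only the OS ring page is exposed
 * to userspace.
 */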
static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        switch (vmf->pgoff - vma->vm_pgoff) {
        case 0: /* HW - forbid access */
        case 1: /* HV - forbid access */
                return VM_FAULT_SIGBUS;
        case 2: /* OS */
                vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
                return VM_FAULT_NOPAGE;
        case 3: /* USER - TODO */
        default:
                return VM_FAULT_SIGBUS;
        }
}

static const struct vm_operations_struct xive_native_tima_vmops = {
        .fault = xive_native_tima_fault,
};

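/*
 * mmap handler of the device fd. Two fixed layouts are supported:
 * the TIMA pages at KVM_XIVE_TIMA_PAGE_OFFSET and the ESB page pairs
 * of all interrupt sources at KVM_XIVE_ESB_PAGE_OFFSET. The pages
 * are faulted in on demand by the vmops above.
 */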
static int kvmppc_xive_native_mmap(struct kvm_device *dev,
                                   struct vm_area_struct *vma)
{
        struct kvmppc_xive *xive = dev->private;

        /* We only allow mappings at fixed offset for now */
        if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
                if (vma_pages(vma) > 4)
                        return -EINVAL;
                vma->vm_ops = &xive_native_tima_vmops;
        } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
                if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
                        return -EINVAL;
                vma->vm_ops = &xive_native_esb_vmops;
        } else {
                return -EINVAL;
        }

        vma->vm_flags |= VM_IO | VM_PFNMAP;
        vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);

        /*
         * Grab the KVM device file address_space to be able to clear
         * the ESB pages mapping when a device is passed-through into
         * the guest.
         */
        xive->mapping = vma->vm_file->f_mapping;
        return 0;
}

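/*
 * KVM_DEV_XIVE_GRP_SOURCE: create or restore an interrupt source.
 * Allocate an IPI for the source if it does not have one yet, restore
 * the LSI state passed by userspace and leave the source masked.
 */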
static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
                                         u64 addr)
{
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        u64 __user *ubufp = (u64 __user *) addr;
        u64 val;
        u16 idx;
        int rc;

        pr_devel("%s irq=0x%lx\n", __func__, irq);

        if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
                return -E2BIG;

        sb = kvmppc_xive_find_source(xive, irq, &idx);
        if (!sb) {
                pr_debug("No source, creating source block...\n");
                sb = kvmppc_xive_create_src_block(xive, irq);
                if (!sb) {
                        pr_err("Failed to create block...\n");
                        return -ENOMEM;
                }
        }
        state = &sb->irq_state[idx];

        if (get_user(val, ubufp)) {
                pr_err("fault getting user info !\n");
                return -EFAULT;
        }

        arch_spin_lock(&sb->lock);

        /*
         * If the source doesn't already have an IPI, allocate
         * one and get the corresponding data
         */
        if (!state->ipi_number) {
                state->ipi_number = xive_native_alloc_irq();
                if (state->ipi_number == 0) {
                        pr_err("Failed to allocate IRQ !\n");
                        rc = -ENXIO;
                        goto unlock;
                }
                xive_native_populate_irq_data(state->ipi_number,
                                              &state->ipi_data);
                pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
                         state->ipi_number, irq);
        }

        /* Restore LSI state */
        if (val & KVM_XIVE_LEVEL_SENSITIVE) {
                state->lsi = true;
                if (val & KVM_XIVE_LEVEL_ASSERTED)
                        state->asserted = true;
                pr_devel("  LSI ! Asserted=%d\n", state->asserted);
        }

        /* Mask IRQ to start with */
        state->act_server = 0;
        state->act_priority = MASKED;
        xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
        xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);

        /* Increment the number of valid sources and mark this one valid */
        if (!state->valid)
                xive->src_count++;
        state->valid = true;

        rc = 0;

unlock:
        arch_spin_unlock(&sb->lock);

        return rc;
}

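/*
 * Apply a new routing to a source: either target it to a (server,
 * priority) pair with an optional EISN, or mask it at the hardware
 * level when the priority is MASKED or the masked flag is set.
 */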
static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
                                        struct kvmppc_xive_src_block *sb,
                                        struct kvmppc_xive_irq_state *state,
                                        u32 server, u8 priority, bool masked,
                                        u32 eisn)
{
        struct kvm *kvm = xive->kvm;
        u32 hw_num;
        int rc = 0;

        arch_spin_lock(&sb->lock);

        if (state->act_server == server && state->act_priority == priority &&
            state->eisn == eisn)
                goto unlock;

        pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
                 priority, server, masked, state->act_server,
                 state->act_priority);

        kvmppc_xive_select_irq(state, &hw_num, NULL);

        if (priority != MASKED && !masked) {
                rc = kvmppc_xive_select_target(kvm, &server, priority);
                if (rc)
                        goto unlock;

                state->act_priority = priority;
                state->act_server = server;
                state->eisn = eisn;

                rc = xive_native_configure_irq(hw_num,
                                               kvmppc_xive_vp(xive, server),
                                               priority, eisn);
        } else {
                state->act_priority = MASKED;
                state->act_server = 0;
                state->eisn = 0;

                rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
        }

unlock:
        arch_spin_unlock(&sb->lock);
        return rc;
}

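/*
 * KVM_DEV_XIVE_GRP_SOURCE_CONFIG: decode the target server, priority,
 * masked flag and EISN from the 64-bit attribute value and route the
 * source accordingly. This backs the H_INT_SET_SOURCE_CONFIG hcall.
 */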
static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
                                                long irq, u64 addr)
{
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        u64 __user *ubufp = (u64 __user *) addr;
        u16 src;
        u64 kvm_cfg;
        u32 server;
        u8 priority;
        bool masked;
        u32 eisn;

        sb = kvmppc_xive_find_source(xive, irq, &src);
        if (!sb)
                return -ENOENT;

        state = &sb->irq_state[src];

        if (!state->valid)
                return -EINVAL;

        if (get_user(kvm_cfg, ubufp))
                return -EFAULT;

        pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);

        priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
                KVM_XIVE_SOURCE_PRIORITY_SHIFT;
        server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
                KVM_XIVE_SOURCE_SERVER_SHIFT;
        masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
                KVM_XIVE_SOURCE_MASKED_SHIFT;
        eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
                KVM_XIVE_SOURCE_EISN_SHIFT;

        if (priority != xive_prio_from_guest(priority)) {
                pr_err("invalid priority for queue %d for VCPU %d\n",
                       priority, server);
                return -EINVAL;
        }

        return kvmppc_xive_native_update_source_config(xive, sb, state, server,
                                                       priority, masked, eisn);
}

static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
                                          long irq, u64 addr)
{
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        struct xive_irq_data *xd;
        u32 hw_num;
        u16 src;
        int rc = 0;

        pr_devel("%s irq=0x%lx", __func__, irq);

        sb = kvmppc_xive_find_source(xive, irq, &src);
        if (!sb)
                return -ENOENT;

        state = &sb->irq_state[src];

        rc = -EINVAL;

        arch_spin_lock(&sb->lock);

        if (state->valid) {
                kvmppc_xive_select_irq(state, &hw_num, &xd);
                xive_native_sync_source(hw_num);
                rc = 0;
        }

        arch_spin_unlock(&sb->lock);
        return rc;
}

static int xive_native_validate_queue_size(u32 qshift)
{
        /*
         * We only support 64K pages for the moment. This is also
         * advertised in the DT property "ibm,xive-eq-sizes"
         */
        switch (qshift) {
        case 0: /* EQ reset */
        case 16:
                return 0;
        case 12:
        case 21:
        case 24:
        default:
                return -EINVAL;
        }
}

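/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (set): configure or reset the event
 * queue of a (server, priority) pair. The guest EQ page is translated
 * and pinned, the EQ is configured at the OPAL level and, if needed,
 * its toggle bit and index are restored (migration). This backs the
 * H_INT_SET_QUEUE_CONFIG hcall.
 */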
static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
                                               long eq_idx, u64 addr)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        struct kvmppc_xive_vcpu *xc;
        void __user *ubufp = (void __user *) addr;
        u32 server;
        u8 priority;
        struct kvm_ppc_xive_eq kvm_eq;
        int rc;
        __be32 *qaddr = NULL;
        struct page *page;
        struct xive_q *q;
        gfn_t gfn;
        unsigned long page_size;
        int srcu_idx;

        /*
         * Demangle priority/server tuple from the EQ identifier
         */
        priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
                KVM_XIVE_EQ_PRIORITY_SHIFT;
        server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
                KVM_XIVE_EQ_SERVER_SHIFT;

        if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
                return -EFAULT;

        vcpu = kvmppc_xive_find_server(kvm, server);
        if (!vcpu) {
                pr_err("Can't find server %d\n", server);
                return -ENOENT;
        }
        xc = vcpu->arch.xive_vcpu;

        if (priority != xive_prio_from_guest(priority)) {
                pr_err("Trying to restore invalid queue %d for VCPU %d\n",
                       priority, server);
                return -EINVAL;
        }
        q = &xc->queues[priority];

        pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
                 __func__, server, priority, kvm_eq.flags,
                 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

        /* reset queue and disable queueing */
        if (!kvm_eq.qshift) {
                q->guest_qaddr  = 0;
                q->guest_qshift = 0;

                rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
                                                        NULL, 0, true);
                if (rc) {
                        pr_err("Failed to reset queue %d for VCPU %d: %d\n",
                               priority, xc->server_num, rc);
                        return rc;
                }

                return 0;
        }

        /*
         * sPAPR specifies an "Unconditional Notify (n) flag" for the
         * H_INT_SET_QUEUE_CONFIG hcall which forces notification
         * without using the coalescing mechanisms provided by the
         * XIVE END ESBs. This is required on KVM as notification
         * using the END ESBs is not supported.
         */
        if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
                pr_err("invalid flags %d\n", kvm_eq.flags);
                return -EINVAL;
        }

        rc = xive_native_validate_queue_size(kvm_eq.qshift);
        if (rc) {
                pr_err("invalid queue size %d\n", kvm_eq.qshift);
                return rc;
        }

        if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
                pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
                       1ull << kvm_eq.qshift);
                return -EINVAL;
        }

        srcu_idx = srcu_read_lock(&kvm->srcu);
        gfn = gpa_to_gfn(kvm_eq.qaddr);

        page_size = kvm_host_page_size(vcpu, gfn);
        if (1ull << kvm_eq.qshift > page_size) {
                srcu_read_unlock(&kvm->srcu, srcu_idx);
                pr_warn("Incompatible host page size %lx!\n", page_size);
                return -EINVAL;
        }

        page = gfn_to_page(kvm, gfn);
        if (is_error_page(page)) {
                srcu_read_unlock(&kvm->srcu, srcu_idx);
                pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
                return -EINVAL;
        }

        qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        /*
         * Back up the queue page guest address so that the EQ page
         * can be marked dirty for migration.
         */
        q->guest_qaddr  = kvm_eq.qaddr;
        q->guest_qshift = kvm_eq.qshift;

        /*
         * Unconditional Notification is forced by default at the
         * OPAL level because the use of END ESBs is not supported by
         * Linux.
         */
        rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
                                        (__be32 *) qaddr, kvm_eq.qshift, true);
        if (rc) {
                pr_err("Failed to configure queue %d for VCPU %d: %d\n",
                       priority, xc->server_num, rc);
                put_page(page);
                return rc;
        }

        /*
         * Only restore the queue state when needed. When doing the
         * H_INT_SET_QUEUE_CONFIG hcall, it should not.
         */
        if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
                rc = xive_native_set_queue_state(xc->vp_id, priority,
                                                 kvm_eq.qtoggle,
                                                 kvm_eq.qindex);
                if (rc)
                        goto error;
        }

        rc = kvmppc_xive_attach_escalation(vcpu, priority,
                                           kvmppc_xive_has_single_escalation(xive));
error:
        if (rc)
                kvmppc_xive_native_cleanup_queue(vcpu, priority);
        return rc;
}

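/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (get): dump the current configuration
 * and state (toggle bit, index) of an event queue, mainly for the
 * benefit of migration.
 */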
static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
                                               long eq_idx, u64 addr)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        struct kvmppc_xive_vcpu *xc;
        struct xive_q *q;
        void __user *ubufp = (u64 __user *) addr;
        u32 server;
        u8 priority;
        struct kvm_ppc_xive_eq kvm_eq;
        u64 qaddr;
        u64 qshift;
        u64 qeoi_page;
        u32 escalate_irq;
        u64 qflags;
        int rc;

        /*
         * Demangle priority/server tuple from the EQ identifier
         */
        priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
                KVM_XIVE_EQ_PRIORITY_SHIFT;
        server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
                KVM_XIVE_EQ_SERVER_SHIFT;

        vcpu = kvmppc_xive_find_server(kvm, server);
        if (!vcpu) {
                pr_err("Can't find server %d\n", server);
                return -ENOENT;
        }
        xc = vcpu->arch.xive_vcpu;

        if (priority != xive_prio_from_guest(priority)) {
                pr_err("invalid priority for queue %d for VCPU %d\n",
                       priority, server);
                return -EINVAL;
        }
        q = &xc->queues[priority];

        memset(&kvm_eq, 0, sizeof(kvm_eq));

        if (!q->qpage)
                return 0;

        rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
                                        &qeoi_page, &escalate_irq, &qflags);
        if (rc)
                return rc;

        kvm_eq.flags = 0;
        if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
                kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;

        kvm_eq.qshift = q->guest_qshift;
        kvm_eq.qaddr  = q->guest_qaddr;

        rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
                                         &kvm_eq.qindex);
        if (rc)
                return rc;

        pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
                 __func__, server, priority, kvm_eq.flags,
                 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

        if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
                return -EFAULT;

        return 0;
}

static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
{
        int i;

        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
                struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

                if (!state->valid)
                        continue;

                if (state->act_priority == MASKED)
                        continue;

                state->eisn = 0;
                state->act_server = 0;
                state->act_priority = MASKED;
                xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
                xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
                if (state->pt_number) {
                        xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
                        xive_native_configure_irq(state->pt_number,
                                                  0, MASKED, 0);
                }
        }
}

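/*
 * KVM_DEV_XIVE_RESET: return the interrupt controller to a pristine
 * state. All vCPU interrupts are masked, the escalation interrupts
 * and queues are freed, and all sources are masked and de-routed.
 */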
static int kvmppc_xive_reset(struct kvmppc_xive *xive)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        unsigned int i;

        pr_devel("%s\n", __func__);

        mutex_lock(&xive->lock);

        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
                unsigned int prio;

                if (!xc)
                        continue;

                kvmppc_xive_disable_vcpu_interrupts(vcpu);

                for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {

                        /* Single escalation, no queue 7 */
                        if (prio == 7 && kvmppc_xive_has_single_escalation(xive))
                                break;

                        if (xc->esc_virq[prio]) {
                                free_irq(xc->esc_virq[prio], vcpu);
                                irq_dispose_mapping(xc->esc_virq[prio]);
                                kfree(xc->esc_virq_names[prio]);
                                xc->esc_virq[prio] = 0;
                        }

                        kvmppc_xive_native_cleanup_queue(vcpu, prio);
                }
        }

        for (i = 0; i <= xive->max_sbid; i++) {
                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

                if (sb) {
                        arch_spin_lock(&sb->lock);
                        kvmppc_xive_reset_sources(sb);
                        arch_spin_unlock(&sb->lock);
                }
        }

        mutex_unlock(&xive->lock);

        return 0;
}

static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
{
        int j;

        for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
                struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
                struct xive_irq_data *xd;
                u32 hw_num;

                if (!state->valid)
                        continue;

                /*
                 * The struct kvmppc_xive_irq_state reflects the state
                 * of the EAS configuration and not the state of the
                 * source. The source is masked by setting the PQ bits
                 * to '-Q', which is what is done before calling the
                 * KVM_DEV_XIVE_EQ_SYNC control.
                 *
                 * If a source EAS is configured, OPAL syncs the XIVE
                 * IC of the source and the XIVE IC of the previous
                 * target if any.
                 *
                 * So it should be fine ignoring MASKED sources as
                 * they have been synced already.
                 */
                if (state->act_priority == MASKED)
                        continue;

                kvmppc_xive_select_irq(state, &hw_num, &xd);
                xive_native_sync_source(hw_num);
                xive_native_sync_queue(hw_num);
        }
}

static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        unsigned int prio;
        int srcu_idx;

        if (!xc)
                return -ENOENT;

        for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
                struct xive_q *q = &xc->queues[prio];

                if (!q->qpage)
                        continue;

                /* Mark EQ page dirty for migration */
                srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
                mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
                srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
        }
        return 0;
}

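/*
 * KVM_DEV_XIVE_EQ_SYNC: flush in-flight interrupts at the XIVE IC
 * level for all configured sources and queues, then mark the EQ
 * pages dirty so that migration transfers their latest contents.
 */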
static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        unsigned int i;

        pr_devel("%s\n", __func__);

        mutex_lock(&xive->lock);
        for (i = 0; i <= xive->max_sbid; i++) {
                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

                if (sb) {
                        arch_spin_lock(&sb->lock);
                        kvmppc_xive_native_sync_sources(sb);
                        arch_spin_unlock(&sb->lock);
                }
        }

        kvm_for_each_vcpu(i, vcpu, kvm) {
                kvmppc_xive_native_vcpu_eq_sync(vcpu);
        }
        mutex_unlock(&xive->lock);

        return 0;
}

static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
                                       struct kvm_device_attr *attr)
{
        struct kvmppc_xive *xive = dev->private;

        switch (attr->group) {
        case KVM_DEV_XIVE_GRP_CTRL:
                switch (attr->attr) {
                case KVM_DEV_XIVE_RESET:
                        return kvmppc_xive_reset(xive);
                case KVM_DEV_XIVE_EQ_SYNC:
                        return kvmppc_xive_native_eq_sync(xive);
                case KVM_DEV_XIVE_NR_SERVERS:
                        return kvmppc_xive_set_nr_servers(xive, attr->addr);
                }
                break;
        case KVM_DEV_XIVE_GRP_SOURCE:
                return kvmppc_xive_native_set_source(xive, attr->attr,
                                                     attr->addr);
        case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
                return kvmppc_xive_native_set_source_config(xive, attr->attr,
                                                            attr->addr);
        case KVM_DEV_XIVE_GRP_EQ_CONFIG:
                return kvmppc_xive_native_set_queue_config(xive, attr->attr,
                                                           attr->addr);
        case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
                return kvmppc_xive_native_sync_source(xive, attr->attr,
                                                      attr->addr);
        }
        return -ENXIO;
}

static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
                                       struct kvm_device_attr *attr)
{
        struct kvmppc_xive *xive = dev->private;

        switch (attr->group) {
        case KVM_DEV_XIVE_GRP_EQ_CONFIG:
                return kvmppc_xive_native_get_queue_config(xive, attr->attr,
                                                           attr->addr);
        }
        return -ENXIO;
}

static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
                                       struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_XIVE_GRP_CTRL:
                switch (attr->attr) {
                case KVM_DEV_XIVE_RESET:
                case KVM_DEV_XIVE_EQ_SYNC:
                case KVM_DEV_XIVE_NR_SERVERS:
                        return 0;
                }
                break;
        case KVM_DEV_XIVE_GRP_SOURCE:
        case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
        case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
                if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
                    attr->attr < KVMPPC_XIVE_NR_IRQS)
                        return 0;
                break;
        case KVM_DEV_XIVE_GRP_EQ_CONFIG:
                return 0;
        }
        return -ENXIO;
}

/*
 * Called when device fd is closed.  kvm->lock is held.
 */
static void kvmppc_xive_native_release(struct kvm_device *dev)
{
        struct kvmppc_xive *xive = dev->private;
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        int i;

        pr_devel("Releasing xive native device\n");

        /*
         * Clear the KVM device file address_space which is used to
         * unmap the ESB pages when a device is passed-through.
         */
        mutex_lock(&xive->mapping_lock);
        xive->mapping = NULL;
        mutex_unlock(&xive->mapping_lock);

        /*
         * Since this is the device release function, we know that
         * userspace does not have any open fd or mmap referring to
         * the device.  Therefore none of the device attribute
         * set/get, mmap, or page fault functions can be executing
         * concurrently, and similarly the connect_vcpu and
         * set/clr_mapped functions cannot be executing either.
         */

        debugfs_remove(xive->dentry);

        /*
         * We should clean up the vCPU interrupt presenters first.
         */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /*
                 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
                 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
                 * Holding the vcpu->mutex also means that the vcpu cannot
                 * be executing the KVM_RUN ioctl, and therefore it cannot
                 * be executing the XIVE push or pull code or accessing
                 * the XIVE MMIO regions.
                 */
                mutex_lock(&vcpu->mutex);
                kvmppc_xive_native_cleanup_vcpu(vcpu);
                mutex_unlock(&vcpu->mutex);
        }

        /*
         * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
         * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
         * against xive code getting called during vcpu execution or
         * set/get one_reg operations.
         */
        kvm->arch.xive = NULL;

        for (i = 0; i <= xive->max_sbid; i++) {
                if (xive->src_blocks[i])
                        kvmppc_xive_free_sources(xive->src_blocks[i]);
                kfree(xive->src_blocks[i]);
                xive->src_blocks[i] = NULL;
        }

        if (xive->vp_base != XIVE_INVALID_VP)
                xive_native_free_vp_block(xive->vp_base);

        /*
         * A reference of the kvmppc_xive pointer is now kept under
         * the xive_devices struct of the machine for reuse. It is
         * freed when the VM is destroyed for now until we fix all the
         * execution paths.
         */

        kfree(dev);
}

/*
 * Create a XIVE device.  kvm->lock is held.
 */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
        struct kvmppc_xive *xive;
        struct kvm *kvm = dev->kvm;

        pr_devel("Creating xive native device\n");

        if (kvm->arch.xive)
                return -EEXIST;

        xive = kvmppc_xive_get_device(kvm, type);
        if (!xive)
                return -ENOMEM;

        dev->private = xive;
        xive->dev = dev;
        xive->kvm = kvm;
        mutex_init(&xive->mapping_lock);
        mutex_init(&xive->lock);

        /* VP allocation is delayed to the first call to connect_vcpu */
        xive->vp_base = XIVE_INVALID_VP;
        /*
         * KVM_MAX_VCPUS limits the number of VMs to roughly 64 per
         * socket on a POWER9 system.
         */
        xive->nr_servers = KVM_MAX_VCPUS;

        if (xive_native_has_single_escalation())
                xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;

        if (xive_native_has_save_restore())
                xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;

        xive->ops = &kvmppc_xive_native_ops;

        kvm->arch.xive = xive;
        return 0;
}

/*
 * Interrupt Pending Buffer (IPB) offset
 */
#define TM_IPB_SHIFT 40
#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)

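/*
 * one_reg accessors for the VP state (presumably the
 * KVM_REG_PPC_VP_STATE register), used by migration to capture and
 * restore the thread interrupt context: CPPR plus the IPB backed up
 * in the NVT by the XIVE IC.
 */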
int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        u64 opal_state;
        int rc;

        if (!kvmppc_xive_enabled(vcpu))
                return -EPERM;

        if (!xc)
                return -ENOENT;

        /* Thread context registers. We only care about IPB and CPPR */
        val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;

        /* Get the VP state from OPAL */
        rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
        if (rc)
                return rc;

        /*
         * Capture the backup of IPB register in the NVT structure and
         * merge it in our KVM VP state.
         */
        val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);

        pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
                 __func__,
                 vcpu->arch.xive_saved_state.nsr,
                 vcpu->arch.xive_saved_state.cppr,
                 vcpu->arch.xive_saved_state.ipb,
                 vcpu->arch.xive_saved_state.pipr,
                 vcpu->arch.xive_saved_state.w01,
                 (u32) vcpu->arch.xive_cam_word, opal_state);

        return 0;
}

int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        struct kvmppc_xive *xive = vcpu->kvm->arch.xive;

        pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
                 val->xive_timaval[0], val->xive_timaval[1]);

        if (!kvmppc_xive_enabled(vcpu))
                return -EPERM;

        if (!xc || !xive)
                return -ENOENT;

        /* We can't update the state of a "pushed" VCPU */
        if (WARN_ON(vcpu->arch.xive_pushed))
                return -EBUSY;

        /*
         * Restore the thread context registers. IPB and CPPR should
         * be the only ones that matter.
         */
        vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];

        /*
         * There is no need to restore the XIVE internal state (IPB
         * stored in the NVT) as the IPB register was merged in KVM VP
         * state when captured.
         */
        return 0;
}

bool kvmppc_xive_native_supported(void)
{
        return xive_native_has_queue_state_support();
}

static int xive_native_debug_show(struct seq_file *m, void *private)
{
        struct kvmppc_xive *xive = m->private;
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        unsigned int i;

        if (!kvm)
                return 0;

        seq_puts(m, "=========\nVCPU state\n=========\n");

        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

                if (!xc)
                        continue;

                seq_printf(m, "VCPU %d: VP=%#x/%02x\n"
                           "    NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
                           xc->server_num, xc->vp_id, xc->vp_chip_id,
                           vcpu->arch.xive_saved_state.nsr,
                           vcpu->arch.xive_saved_state.cppr,
                           vcpu->arch.xive_saved_state.ipb,
                           vcpu->arch.xive_saved_state.pipr,
                           be64_to_cpu(vcpu->arch.xive_saved_state.w01),
                           be32_to_cpu(vcpu->arch.xive_cam_word));

                kvmppc_xive_debug_show_queues(m, vcpu);
        }

        seq_puts(m, "=========\nSources\n=========\n");

        for (i = 0; i <= xive->max_sbid; i++) {
                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

                if (sb) {
                        arch_spin_lock(&sb->lock);
                        kvmppc_xive_debug_show_sources(m, sb);
                        arch_spin_unlock(&sb->lock);
                }
        }

        return 0;
}

DEFINE_SHOW_ATTRIBUTE(xive_native_debug);

static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
        char *name;

        name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
        if (!name) {
                pr_err("%s: no memory for name\n", __func__);
                return;
        }

        xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir,
                                           xive, &xive_native_debug_fops);

        pr_debug("%s: created %s\n", __func__, name);
        kfree(name);
}

static void kvmppc_xive_native_init(struct kvm_device *dev)
{
        struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

        /* Register some debug interfaces */
        xive_native_debugfs_init(xive);
}

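/*
 * Device ops of the XIVE native KVM device, presumably instantiated
 * by userspace with KVM_CREATE_DEVICE and the KVM_DEV_TYPE_XIVE type.
 */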
struct kvm_device_ops kvm_xive_native_ops = {
        .name = "kvm-xive-native",
        .create = kvmppc_xive_native_create,
        .init = kvmppc_xive_native_init,
        .release = kvmppc_xive_native_release,
        .set_attr = kvmppc_xive_native_set_attr,
        .get_attr = kvmppc_xive_native_get_attr,
        .has_attr = kvmppc_xive_native_has_attr,
        .mmap = kvmppc_xive_native_mmap,
};