arch/powerpc/kvm/book3s_xive_native.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2019, IBM Corporation.
4  */
5
6 #define pr_fmt(fmt) "xive-kvm: " fmt
7
8 #include <linux/kernel.h>
9 #include <linux/kvm_host.h>
10 #include <linux/err.h>
11 #include <linux/gfp.h>
12 #include <linux/spinlock.h>
13 #include <linux/delay.h>
14 #include <linux/file.h>
15 #include <asm/uaccess.h>
16 #include <asm/kvm_book3s.h>
17 #include <asm/kvm_ppc.h>
18 #include <asm/hvcall.h>
19 #include <asm/xive.h>
20 #include <asm/xive-regs.h>
21 #include <asm/debug.h>
22 #include <asm/debugfs.h>
23 #include <asm/opal.h>
24
25 #include <linux/debugfs.h>
26 #include <linux/seq_file.h>
27
28 #include "book3s_xive.h"
29
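/*
 * MMIO load from an ESB page of a source: the load returns the PQ state
 * bits of the source and, at one of the XIVE_ESB_SET_PQ_* offsets, also
 * updates them (e.g. XIVE_ESB_SET_PQ_01 is used below to mask a source).
 */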
30 static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
31 {
32         u64 val;
33
34         /*
35          * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
36          * load operation, so there is no need to enforce load-after-store
37          * ordering.
38          */
39
40         val = in_be64(xd->eoi_mmio + offset);
41         return (u8)val;
42 }
43
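/*
 * Tear down one event queue of a vCPU: disable the queue in the XIVE IC
 * and release the reference taken on the guest page backing it.
 */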
44 static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
45 {
46         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
47         struct xive_q *q = &xc->queues[prio];
48
49         xive_native_disable_queue(xc->vp_id, q, prio);
50         if (q->qpage) {
51                 put_page(virt_to_page(q->qpage));
52                 q->qpage = NULL;
53         }
54 }
55
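/*
 * Reconfigure a queue and, only once the new configuration is in place,
 * drop the reference on the previous queue page, if any.
 */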
56 static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
57                                               u8 prio, __be32 *qpage,
58                                               u32 order, bool can_escalate)
59 {
60         int rc;
61         __be32 *qpage_prev = q->qpage;
62
63         rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
64                                          can_escalate);
65         if (rc)
66                 return rc;
67
68         if (qpage_prev)
69                 put_page(virt_to_page(qpage_prev));
70
71         return rc;
72 }
73
74 void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
75 {
76         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
77         int i;
78
79         if (!kvmppc_xive_enabled(vcpu))
80                 return;
81
82         if (!xc)
83                 return;
84
85         pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
86
87         /* Ensure no interrupt is still routed to that VP */
88         xc->valid = false;
89         kvmppc_xive_disable_vcpu_interrupts(vcpu);
90
91         /* Free escalations */
92         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
93                 /* Free the escalation irq */
94                 if (xc->esc_virq[i]) {
95                         if (xc->xive->single_escalation)
96                                 xive_cleanup_single_escalation(vcpu, xc,
97                                                         xc->esc_virq[i]);
98                         free_irq(xc->esc_virq[i], vcpu);
99                         irq_dispose_mapping(xc->esc_virq[i]);
100                         kfree(xc->esc_virq_names[i]);
101                         xc->esc_virq[i] = 0;
102                 }
103         }
104
105         /* Disable the VP */
106         xive_native_disable_vp(xc->vp_id);
107
108         /* Clear the cam word so guest entry won't try to push context */
109         vcpu->arch.xive_cam_word = 0;
110
111         /* Free the queues */
112         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
113                 kvmppc_xive_native_cleanup_queue(vcpu, i);
114         }
115
116         /* Free the VP */
117         kfree(xc);
118
119         /* Cleanup the vcpu */
120         vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
121         vcpu->arch.xive_vcpu = NULL;
122 }
123
124 int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
125                                     struct kvm_vcpu *vcpu, u32 server_num)
126 {
127         struct kvmppc_xive *xive = dev->private;
128         struct kvmppc_xive_vcpu *xc = NULL;
129         int rc;
130         u32 vp_id;
131
132         pr_devel("native_connect_vcpu(server=%d)\n", server_num);
133
134         if (dev->ops != &kvm_xive_native_ops) {
135                 pr_devel("Wrong ops !\n");
136                 return -EPERM;
137         }
138         if (xive->kvm != vcpu->kvm)
139                 return -EPERM;
140         if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
141                 return -EBUSY;
142
143         mutex_lock(&xive->lock);
144
145         rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
146         if (rc)
147                 goto bail;
148
149         xc = kzalloc(sizeof(*xc), GFP_KERNEL);
150         if (!xc) {
151                 rc = -ENOMEM;
152                 goto bail;
153         }
154
155         vcpu->arch.xive_vcpu = xc;
156         xc->xive = xive;
157         xc->vcpu = vcpu;
158         xc->server_num = server_num;
159
160         xc->vp_id = vp_id;
161         xc->valid = true;
162         vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
163
164         rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
165         if (rc) {
166                 pr_err("Failed to get VP info from OPAL: %d\n", rc);
167                 goto bail;
168         }
169
170         /*
171          * Enable the VP first as the single escalation mode will
172          * affect the escalation interrupt numbering
173          */
174         rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
175         if (rc) {
176                 pr_err("Failed to enable VP in OPAL: %d\n", rc);
177                 goto bail;
178         }
179
180         /* Configure VCPU fields for use by assembly push/pull */
181         vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
182         vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
183
184         /* TODO: reset all queues to a clean state ? */
185 bail:
186         mutex_unlock(&xive->lock);
187         if (rc)
188                 kvmppc_xive_native_cleanup_vcpu(vcpu);
189
190         return rc;
191 }
192
193 /*
194  * Device passthrough support
195  */
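/*
 * Each guest IRQ is backed by two ESB pages in the device mapping
 * (trigger and EOI), starting at page offset KVM_XIVE_ESB_PAGE_OFFSET.
 * When an interrupt is passed through (or un-passed-through), the
 * corresponding pages are unmapped so that the next guest access
 * faults in the new ESB pages through xive_native_esb_fault().
 */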
196 static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
197 {
198         struct kvmppc_xive *xive = kvm->arch.xive;
199         pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;
200
201         if (irq >= KVMPPC_XIVE_NR_IRQS)
202                 return -EINVAL;
203
204         /*
205          * Clear the ESB pages of the IRQ number being mapped (or
206          * unmapped) into the guest and let the VM fault handler
207          * repopulate with the appropriate ESB pages (device or IC)
208          */
209         pr_debug("clearing esb pages for girq 0x%lx\n", irq);
210         mutex_lock(&xive->mapping_lock);
211         if (xive->mapping)
212                 unmap_mapping_range(xive->mapping,
213                                     esb_pgoff << PAGE_SHIFT,
214                                     2ull << PAGE_SHIFT, 1);
215         mutex_unlock(&xive->mapping_lock);
216         return 0;
217 }
218
219 static struct kvmppc_xive_ops kvmppc_xive_native_ops =  {
220         .reset_mapped = kvmppc_xive_native_reset_mapped,
221 };
222
223 static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
224 {
225         struct vm_area_struct *vma = vmf->vma;
226         struct kvm_device *dev = vma->vm_file->private_data;
227         struct kvmppc_xive *xive = dev->private;
228         struct kvmppc_xive_src_block *sb;
229         struct kvmppc_xive_irq_state *state;
230         struct xive_irq_data *xd;
231         u32 hw_num;
232         u16 src;
233         u64 page;
234         unsigned long irq;
235         u64 page_offset;
236
237         /*
238          * Linux/KVM uses a two-page ESB setting, one for trigger and
239          * one for EOI
240          */
241         page_offset = vmf->pgoff - vma->vm_pgoff;
242         irq = page_offset / 2;
243
244         sb = kvmppc_xive_find_source(xive, irq, &src);
245         if (!sb) {
246                 pr_devel("%s: source %lx not found !\n", __func__, irq);
247                 return VM_FAULT_SIGBUS;
248         }
249
250         state = &sb->irq_state[src];
251
252         /* Some sanity checking */
253         if (!state->valid) {
254                 pr_devel("%s: source %lx invalid !\n", __func__, irq);
255                 return VM_FAULT_SIGBUS;
256         }
257
258         kvmppc_xive_select_irq(state, &hw_num, &xd);
259
260         arch_spin_lock(&sb->lock);
261
262         /*
263          * first/even page is for trigger
264          * second/odd page is for EOI and management.
265          */
266         page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
267         arch_spin_unlock(&sb->lock);
268
269         if (WARN_ON(!page)) {
270                 pr_err("%s: accessing invalid ESB page for source %lx !\n",
271                        __func__, irq);
272                 return VM_FAULT_SIGBUS;
273         }
274
275         vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
276         return VM_FAULT_NOPAGE;
277 }
278
279 static const struct vm_operations_struct xive_native_esb_vmops = {
280         .fault = xive_native_esb_fault,
281 };
282
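/*
 * The TIMA is made of four pages: HW, HV, OS and USER. Only the OS page
 * is exposed to the guest through this mapping; accesses to the other
 * pages fault with SIGBUS.
 */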
283 static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
284 {
285         struct vm_area_struct *vma = vmf->vma;
286
287         switch (vmf->pgoff - vma->vm_pgoff) {
288         case 0: /* HW - forbid access */
289         case 1: /* HV - forbid access */
290                 return VM_FAULT_SIGBUS;
291         case 2: /* OS */
292                 vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
293                 return VM_FAULT_NOPAGE;
294         case 3: /* USER - TODO */
295         default:
296                 return VM_FAULT_SIGBUS;
297         }
298 }
299
300 static const struct vm_operations_struct xive_native_tima_vmops = {
301         .fault = xive_native_tima_fault,
302 };
303
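/*
 * Layout of the device mapping: 4 TIMA pages at KVM_XIVE_TIMA_PAGE_OFFSET
 * and 2 ESB pages per interrupt at KVM_XIVE_ESB_PAGE_OFFSET, all
 * populated on demand by the fault handlers above.
 *
 * Illustrative userspace sketch (an assumption, not part of this file):
 * a VMM would typically map the TIMA pages with something like
 *
 *     tima = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
 *                 MAP_SHARED, xive_device_fd,
 *                 KVM_XIVE_TIMA_PAGE_OFFSET * page_size);
 *
 * and then only use the OS page (offset 2 * page_size within it).
 */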
304 static int kvmppc_xive_native_mmap(struct kvm_device *dev,
305                                    struct vm_area_struct *vma)
306 {
307         struct kvmppc_xive *xive = dev->private;
308
309         /* We only allow mappings at fixed offset for now */
310         if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
311                 if (vma_pages(vma) > 4)
312                         return -EINVAL;
313                 vma->vm_ops = &xive_native_tima_vmops;
314         } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
315                 if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
316                         return -EINVAL;
317                 vma->vm_ops = &xive_native_esb_vmops;
318         } else {
319                 return -EINVAL;
320         }
321
322         vma->vm_flags |= VM_IO | VM_PFNMAP;
323         vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
324
325         /*
326          * Grab the KVM device file address_space to be able to clear
327          * the ESB pages mapping when a device is passed-through into
328          * the guest.
329          */
330         xive->mapping = vma->vm_file->f_mapping;
331         return 0;
332 }
333
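/*
 * KVM_DEV_XIVE_GRP_SOURCE: create/restore a guest interrupt source.
 * 'irq' is the guest IRQ number (attr->attr) and 'addr' points to a
 * u64 holding the KVM_XIVE_LEVEL_* state bits. The backing IPI is
 * allocated on first use and the source is left masked.
 *
 * Illustrative userspace sketch (an assumption, not part of this file):
 *
 *     __u64 state = KVM_XIVE_LEVEL_SENSITIVE;
 *     struct kvm_device_attr attr = {
 *             .group = KVM_DEV_XIVE_GRP_SOURCE,
 *             .attr  = girq,
 *             .addr  = (__u64)(uintptr_t)&state,
 *     };
 *     ioctl(xive_device_fd, KVM_SET_DEVICE_ATTR, &attr);
 */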
334 static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
335                                          u64 addr)
336 {
337         struct kvmppc_xive_src_block *sb;
338         struct kvmppc_xive_irq_state *state;
339         u64 __user *ubufp = (u64 __user *) addr;
340         u64 val;
341         u16 idx;
342         int rc;
343
344         pr_devel("%s irq=0x%lx\n", __func__, irq);
345
346         if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
347                 return -E2BIG;
348
349         sb = kvmppc_xive_find_source(xive, irq, &idx);
350         if (!sb) {
351                 pr_debug("No source, creating source block...\n");
352                 sb = kvmppc_xive_create_src_block(xive, irq);
353                 if (!sb) {
354                         pr_err("Failed to create block...\n");
355                         return -ENOMEM;
356                 }
357         }
358         state = &sb->irq_state[idx];
359
360         if (get_user(val, ubufp)) {
361                 pr_err("fault getting user info !\n");
362                 return -EFAULT;
363         }
364
365         arch_spin_lock(&sb->lock);
366
367         /*
368          * If the source doesn't already have an IPI, allocate
369          * one and get the corresponding data
370          */
371         if (!state->ipi_number) {
372                 state->ipi_number = xive_native_alloc_irq();
373                 if (state->ipi_number == 0) {
374                         pr_err("Failed to allocate IRQ !\n");
375                         rc = -ENXIO;
376                         goto unlock;
377                 }
378                 xive_native_populate_irq_data(state->ipi_number,
379                                               &state->ipi_data);
380                 pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
381                          state->ipi_number, irq);
382         }
383
384         /* Restore LSI state */
385         if (val & KVM_XIVE_LEVEL_SENSITIVE) {
386                 state->lsi = true;
387                 if (val & KVM_XIVE_LEVEL_ASSERTED)
388                         state->asserted = true;
389                 pr_devel("  LSI ! Asserted=%d\n", state->asserted);
390         }
391
392         /* Mask IRQ to start with */
393         state->act_server = 0;
394         state->act_priority = MASKED;
395         xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
396         xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
397
398         /* Increment the number of valid sources and mark this one valid */
399         if (!state->valid)
400                 xive->src_count++;
401         state->valid = true;
402
403         rc = 0;
404
405 unlock:
406         arch_spin_unlock(&sb->lock);
407
408         return rc;
409 }
410
411 static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
412                                         struct kvmppc_xive_src_block *sb,
413                                         struct kvmppc_xive_irq_state *state,
414                                         u32 server, u8 priority, bool masked,
415                                         u32 eisn)
416 {
417         struct kvm *kvm = xive->kvm;
418         u32 hw_num;
419         int rc = 0;
420
421         arch_spin_lock(&sb->lock);
422
423         if (state->act_server == server && state->act_priority == priority &&
424             state->eisn == eisn)
425                 goto unlock;
426
427         pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
428                  priority, server, masked, state->act_server,
429                  state->act_priority);
430
431         kvmppc_xive_select_irq(state, &hw_num, NULL);
432
433         if (priority != MASKED && !masked) {
434                 rc = kvmppc_xive_select_target(kvm, &server, priority);
435                 if (rc)
436                         goto unlock;
437
438                 state->act_priority = priority;
439                 state->act_server = server;
440                 state->eisn = eisn;
441
442                 rc = xive_native_configure_irq(hw_num,
443                                                kvmppc_xive_vp(xive, server),
444                                                priority, eisn);
445         } else {
446                 state->act_priority = MASKED;
447                 state->act_server = 0;
448                 state->eisn = 0;
449
450                 rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
451         }
452
453 unlock:
454         arch_spin_unlock(&sb->lock);
455         return rc;
456 }
457
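/*
 * KVM_DEV_XIVE_GRP_SOURCE_CONFIG: the u64 at 'addr' packs the target
 * server, priority, masked state and EISN of the source, extracted
 * below with the KVM_XIVE_SOURCE_* masks and shifts.
 */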
458 static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
459                                                 long irq, u64 addr)
460 {
461         struct kvmppc_xive_src_block *sb;
462         struct kvmppc_xive_irq_state *state;
463         u64 __user *ubufp = (u64 __user *) addr;
464         u16 src;
465         u64 kvm_cfg;
466         u32 server;
467         u8 priority;
468         bool masked;
469         u32 eisn;
470
471         sb = kvmppc_xive_find_source(xive, irq, &src);
472         if (!sb)
473                 return -ENOENT;
474
475         state = &sb->irq_state[src];
476
477         if (!state->valid)
478                 return -EINVAL;
479
480         if (get_user(kvm_cfg, ubufp))
481                 return -EFAULT;
482
483         pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
484
485         priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
486                 KVM_XIVE_SOURCE_PRIORITY_SHIFT;
487         server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
488                 KVM_XIVE_SOURCE_SERVER_SHIFT;
489         masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
490                 KVM_XIVE_SOURCE_MASKED_SHIFT;
491         eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
492                 KVM_XIVE_SOURCE_EISN_SHIFT;
493
494         if (priority != xive_prio_from_guest(priority)) {
495                 pr_err("invalid priority for queue %d for VCPU %d\n",
496                        priority, server);
497                 return -EINVAL;
498         }
499
500         return kvmppc_xive_native_update_source_config(xive, sb, state, server,
501                                                        priority, masked, eisn);
502 }
503
504 static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
505                                           long irq, u64 addr)
506 {
507         struct kvmppc_xive_src_block *sb;
508         struct kvmppc_xive_irq_state *state;
509         struct xive_irq_data *xd;
510         u32 hw_num;
511         u16 src;
512         int rc = 0;
513
514         pr_devel("%s irq=0x%lx\n", __func__, irq);
515
516         sb = kvmppc_xive_find_source(xive, irq, &src);
517         if (!sb)
518                 return -ENOENT;
519
520         state = &sb->irq_state[src];
521
522         rc = -EINVAL;
523
524         arch_spin_lock(&sb->lock);
525
526         if (state->valid) {
527                 kvmppc_xive_select_irq(state, &hw_num, &xd);
528                 xive_native_sync_source(hw_num);
529                 rc = 0;
530         }
531
532         arch_spin_unlock(&sb->lock);
533         return rc;
534 }
535
536 static int xive_native_validate_queue_size(u32 qshift)
537 {
538         /*
539          * We only support 64K pages for the moment. This is also
540          * advertised in the DT property "ibm,xive-eq-sizes"
541          */
542         switch (qshift) {
543         case 0: /* EQ reset */
544         case 16:
545                 return 0;
546         case 12:
547         case 21:
548         case 24:
549         default:
550                 return -EINVAL;
551         }
552 }
553
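/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG: 'eq_idx' packs the server and priority
 * identifying the event queue (see the KVM_XIVE_EQ_* masks below) and
 * 'addr' points to a struct kvm_ppc_xive_eq describing the guest queue
 * page, its size and its toggle/index state.
 */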
554 static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
555                                                long eq_idx, u64 addr)
556 {
557         struct kvm *kvm = xive->kvm;
558         struct kvm_vcpu *vcpu;
559         struct kvmppc_xive_vcpu *xc;
560         void __user *ubufp = (void __user *) addr;
561         u32 server;
562         u8 priority;
563         struct kvm_ppc_xive_eq kvm_eq;
564         int rc;
565         __be32 *qaddr = NULL;
566         struct page *page;
567         struct xive_q *q;
568         gfn_t gfn;
569         unsigned long page_size;
570         int srcu_idx;
571
572         /*
573          * Demangle priority/server tuple from the EQ identifier
574          */
575         priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
576                 KVM_XIVE_EQ_PRIORITY_SHIFT;
577         server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
578                 KVM_XIVE_EQ_SERVER_SHIFT;
579
580         if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
581                 return -EFAULT;
582
583         vcpu = kvmppc_xive_find_server(kvm, server);
584         if (!vcpu) {
585                 pr_err("Can't find server %d\n", server);
586                 return -ENOENT;
587         }
588         xc = vcpu->arch.xive_vcpu;
589
590         if (priority != xive_prio_from_guest(priority)) {
591                 pr_err("Trying to restore invalid queue %d for VCPU %d\n",
592                        priority, server);
593                 return -EINVAL;
594         }
595         q = &xc->queues[priority];
596
597         pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
598                  __func__, server, priority, kvm_eq.flags,
599                  kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
600
601         /* reset queue and disable queueing */
602         if (!kvm_eq.qshift) {
603                 q->guest_qaddr  = 0;
604                 q->guest_qshift = 0;
605
606                 rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
607                                                         NULL, 0, true);
608                 if (rc) {
609                         pr_err("Failed to reset queue %d for VCPU %d: %d\n",
610                                priority, xc->server_num, rc);
611                         return rc;
612                 }
613
614                 return 0;
615         }
616
617         /*
618          * sPAPR specifies an "Unconditional Notify (n) flag" for the
619          * H_INT_SET_QUEUE_CONFIG hcall which forces notification
620          * without using the coalescing mechanisms provided by the
621          * XIVE END ESBs. This is required on KVM as notification
622          * using the END ESBs is not supported.
623          */
624         if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
625                 pr_err("invalid flags %d\n", kvm_eq.flags);
626                 return -EINVAL;
627         }
628
629         rc = xive_native_validate_queue_size(kvm_eq.qshift);
630         if (rc) {
631                 pr_err("invalid queue size %d\n", kvm_eq.qshift);
632                 return rc;
633         }
634
635         if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
636                 pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
637                        1ull << kvm_eq.qshift);
638                 return -EINVAL;
639         }
640
641         srcu_idx = srcu_read_lock(&kvm->srcu);
642         gfn = gpa_to_gfn(kvm_eq.qaddr);
643
644         page_size = kvm_host_page_size(vcpu, gfn);
645         if (1ull << kvm_eq.qshift > page_size) {
646                 srcu_read_unlock(&kvm->srcu, srcu_idx);
647                 pr_warn("Incompatible host page size %lx!\n", page_size);
648                 return -EINVAL;
649         }
650
651         page = gfn_to_page(kvm, gfn);
652         if (is_error_page(page)) {
653                 srcu_read_unlock(&kvm->srcu, srcu_idx);
654                 pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
655                 return -EINVAL;
656         }
657
658         qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
659         srcu_read_unlock(&kvm->srcu, srcu_idx);
660
661         /*
662          * Back up the queue page guest address so that the EQ page
663          * can be marked dirty for migration.
664          */
665         q->guest_qaddr  = kvm_eq.qaddr;
666         q->guest_qshift = kvm_eq.qshift;
667
668          /*
669           * Unconditional Notification is forced by default at the
670           * OPAL level because the use of END ESBs is not supported by
671           * Linux.
672           */
673         rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
674                                         (__be32 *) qaddr, kvm_eq.qshift, true);
675         if (rc) {
676                 pr_err("Failed to configure queue %d for VCPU %d: %d\n",
677                        priority, xc->server_num, rc);
678                 put_page(page);
679                 return rc;
680         }
681
682         /*
683          * Only restore the queue state when needed. When doing the
684          * H_INT_SET_SOURCE_CONFIG hcall, it should not be restored.
685          */
686         if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
687                 rc = xive_native_set_queue_state(xc->vp_id, priority,
688                                                  kvm_eq.qtoggle,
689                                                  kvm_eq.qindex);
690                 if (rc)
691                         goto error;
692         }
693
694         rc = kvmppc_xive_attach_escalation(vcpu, priority,
695                                            xive->single_escalation);
696 error:
697         if (rc)
698                 kvmppc_xive_native_cleanup_queue(vcpu, priority);
699         return rc;
700 }
701
702 static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
703                                                long eq_idx, u64 addr)
704 {
705         struct kvm *kvm = xive->kvm;
706         struct kvm_vcpu *vcpu;
707         struct kvmppc_xive_vcpu *xc;
708         struct xive_q *q;
709         void __user *ubufp = (u64 __user *) addr;
710         u32 server;
711         u8 priority;
712         struct kvm_ppc_xive_eq kvm_eq;
713         u64 qaddr;
714         u64 qshift;
715         u64 qeoi_page;
716         u32 escalate_irq;
717         u64 qflags;
718         int rc;
719
720         /*
721          * Demangle priority/server tuple from the EQ identifier
722          */
723         priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
724                 KVM_XIVE_EQ_PRIORITY_SHIFT;
725         server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
726                 KVM_XIVE_EQ_SERVER_SHIFT;
727
728         vcpu = kvmppc_xive_find_server(kvm, server);
729         if (!vcpu) {
730                 pr_err("Can't find server %d\n", server);
731                 return -ENOENT;
732         }
733         xc = vcpu->arch.xive_vcpu;
734
735         if (priority != xive_prio_from_guest(priority)) {
736                 pr_err("invalid priority for queue %d for VCPU %d\n",
737                        priority, server);
738                 return -EINVAL;
739         }
740         q = &xc->queues[priority];
741
742         memset(&kvm_eq, 0, sizeof(kvm_eq));
743
744         if (!q->qpage)
745                 return 0;
746
747         rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
748                                         &qeoi_page, &escalate_irq, &qflags);
749         if (rc)
750                 return rc;
751
752         kvm_eq.flags = 0;
753         if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
754                 kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
755
756         kvm_eq.qshift = q->guest_qshift;
757         kvm_eq.qaddr  = q->guest_qaddr;
758
759         rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
760                                          &kvm_eq.qindex);
761         if (rc)
762                 return rc;
763
764         pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
765                  __func__, server, priority, kvm_eq.flags,
766                  kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
767
768         if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
769                 return -EFAULT;
770
771         return 0;
772 }
773
774 static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
775 {
776         int i;
777
778         for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
779                 struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
780
781                 if (!state->valid)
782                         continue;
783
784                 if (state->act_priority == MASKED)
785                         continue;
786
787                 state->eisn = 0;
788                 state->act_server = 0;
789                 state->act_priority = MASKED;
790                 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
791                 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
792                 if (state->pt_number) {
793                         xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
794                         xive_native_configure_irq(state->pt_number,
795                                                   0, MASKED, 0);
796                 }
797         }
798 }
799
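/*
 * KVM_DEV_XIVE_RESET control: mask all sources, release the escalation
 * interrupts and free the queues of every vCPU, bringing the device
 * back to its initial state.
 */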
800 static int kvmppc_xive_reset(struct kvmppc_xive *xive)
801 {
802         struct kvm *kvm = xive->kvm;
803         struct kvm_vcpu *vcpu;
804         unsigned int i;
805
806         pr_devel("%s\n", __func__);
807
808         mutex_lock(&xive->lock);
809
810         kvm_for_each_vcpu(i, vcpu, kvm) {
811                 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
812                 unsigned int prio;
813
814                 if (!xc)
815                         continue;
816
817                 kvmppc_xive_disable_vcpu_interrupts(vcpu);
818
819                 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
820
821                         /* Single escalation, no queue 7 */
822                         if (prio == 7 && xive->single_escalation)
823                                 break;
824
825                         if (xc->esc_virq[prio]) {
826                                 free_irq(xc->esc_virq[prio], vcpu);
827                                 irq_dispose_mapping(xc->esc_virq[prio]);
828                                 kfree(xc->esc_virq_names[prio]);
829                                 xc->esc_virq[prio] = 0;
830                         }
831
832                         kvmppc_xive_native_cleanup_queue(vcpu, prio);
833                 }
834         }
835
836         for (i = 0; i <= xive->max_sbid; i++) {
837                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
838
839                 if (sb) {
840                         arch_spin_lock(&sb->lock);
841                         kvmppc_xive_reset_sources(sb);
842                         arch_spin_unlock(&sb->lock);
843                 }
844         }
845
846         mutex_unlock(&xive->lock);
847
848         return 0;
849 }
850
851 static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
852 {
853         int j;
854
855         for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
856                 struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
857                 struct xive_irq_data *xd;
858                 u32 hw_num;
859
860                 if (!state->valid)
861                         continue;
862
863                 /*
864                  * The struct kvmppc_xive_irq_state reflects the state
865                  * of the EAS configuration and not the state of the
866                  * source. The source is masked setting the PQ bits to
867                  * source. The source is masked by setting the PQ bits to
868                  * the KVM_DEV_XIVE_EQ_SYNC control.
869                  *
870                  * If a source EAS is configured, OPAL syncs the XIVE
871                  * IC of the source and the XIVE IC of the previous
872                  * target if any.
873                  *
874                  * So it should be fine ignoring MASKED sources as
875                  * they have been synced already.
876                  */
877                 if (state->act_priority == MASKED)
878                         continue;
879
880                 kvmppc_xive_select_irq(state, &hw_num, &xd);
881                 xive_native_sync_source(hw_num);
882                 xive_native_sync_queue(hw_num);
883         }
884 }
885
886 static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
887 {
888         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
889         unsigned int prio;
890         int srcu_idx;
891
892         if (!xc)
893                 return -ENOENT;
894
895         for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
896                 struct xive_q *q = &xc->queues[prio];
897
898                 if (!q->qpage)
899                         continue;
900
901                 /* Mark EQ page dirty for migration */
902                 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
903                 mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
904                 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
905         }
906         return 0;
907 }
908
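/*
 * KVM_DEV_XIVE_EQ_SYNC control: sync the sources and queues in the XIVE
 * IC and mark the EQ pages dirty so that a migration captures a
 * consistent state.
 */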
909 static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
910 {
911         struct kvm *kvm = xive->kvm;
912         struct kvm_vcpu *vcpu;
913         unsigned int i;
914
915         pr_devel("%s\n", __func__);
916
917         mutex_lock(&xive->lock);
918         for (i = 0; i <= xive->max_sbid; i++) {
919                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
920
921                 if (sb) {
922                         arch_spin_lock(&sb->lock);
923                         kvmppc_xive_native_sync_sources(sb);
924                         arch_spin_unlock(&sb->lock);
925                 }
926         }
927
928         kvm_for_each_vcpu(i, vcpu, kvm) {
929                 kvmppc_xive_native_vcpu_eq_sync(vcpu);
930         }
931         mutex_unlock(&xive->lock);
932
933         return 0;
934 }
935
936 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
937                                        struct kvm_device_attr *attr)
938 {
939         struct kvmppc_xive *xive = dev->private;
940
941         switch (attr->group) {
942         case KVM_DEV_XIVE_GRP_CTRL:
943                 switch (attr->attr) {
944                 case KVM_DEV_XIVE_RESET:
945                         return kvmppc_xive_reset(xive);
946                 case KVM_DEV_XIVE_EQ_SYNC:
947                         return kvmppc_xive_native_eq_sync(xive);
948                 case KVM_DEV_XIVE_NR_SERVERS:
949                         return kvmppc_xive_set_nr_servers(xive, attr->addr);
950                 }
951                 break;
952         case KVM_DEV_XIVE_GRP_SOURCE:
953                 return kvmppc_xive_native_set_source(xive, attr->attr,
954                                                      attr->addr);
955         case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
956                 return kvmppc_xive_native_set_source_config(xive, attr->attr,
957                                                             attr->addr);
958         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
959                 return kvmppc_xive_native_set_queue_config(xive, attr->attr,
960                                                            attr->addr);
961         case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
962                 return kvmppc_xive_native_sync_source(xive, attr->attr,
963                                                       attr->addr);
964         }
965         return -ENXIO;
966 }
967
968 static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
969                                        struct kvm_device_attr *attr)
970 {
971         struct kvmppc_xive *xive = dev->private;
972
973         switch (attr->group) {
974         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
975                 return kvmppc_xive_native_get_queue_config(xive, attr->attr,
976                                                            attr->addr);
977         }
978         return -ENXIO;
979 }
980
981 static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
982                                        struct kvm_device_attr *attr)
983 {
984         switch (attr->group) {
985         case KVM_DEV_XIVE_GRP_CTRL:
986                 switch (attr->attr) {
987                 case KVM_DEV_XIVE_RESET:
988                 case KVM_DEV_XIVE_EQ_SYNC:
989                 case KVM_DEV_XIVE_NR_SERVERS:
990                         return 0;
991                 }
992                 break;
993         case KVM_DEV_XIVE_GRP_SOURCE:
994         case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
995         case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
996                 if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
997                     attr->attr < KVMPPC_XIVE_NR_IRQS)
998                         return 0;
999                 break;
1000         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
1001                 return 0;
1002         }
1003         return -ENXIO;
1004 }
1005
1006 /*
1007  * Called when device fd is closed.  kvm->lock is held.
1008  */
1009 static void kvmppc_xive_native_release(struct kvm_device *dev)
1010 {
1011         struct kvmppc_xive *xive = dev->private;
1012         struct kvm *kvm = xive->kvm;
1013         struct kvm_vcpu *vcpu;
1014         int i;
1015
1016         pr_devel("Releasing xive native device\n");
1017
1018         /*
1019          * Clear the KVM device file address_space which is used to
1020          * unmap the ESB pages when a device is passed-through.
1021          */
1022         mutex_lock(&xive->mapping_lock);
1023         xive->mapping = NULL;
1024         mutex_unlock(&xive->mapping_lock);
1025
1026         /*
1027          * Since this is the device release function, we know that
1028          * userspace does not have any open fd or mmap referring to
1029          * the device.  Therefore none of the device attribute
1030          * set/get, mmap, or page fault functions can be
1031          * executing concurrently, and similarly, the
1032          * connect_vcpu and set/clr_mapped functions also
1033          * cannot be executing.
1034          */
1035
1036         debugfs_remove(xive->dentry);
1037
1038         /*
1039          * We should clean up the vCPU interrupt presenters first.
1040          */
1041         kvm_for_each_vcpu(i, vcpu, kvm) {
1042                 /*
1043                  * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1044                  * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
1045                  * Holding the vcpu->mutex also means that the vcpu cannot
1046                  * be executing the KVM_RUN ioctl, and therefore it cannot
1047                  * be executing the XIVE push or pull code or accessing
1048                  * the XIVE MMIO regions.
1049                  */
1050                 mutex_lock(&vcpu->mutex);
1051                 kvmppc_xive_native_cleanup_vcpu(vcpu);
1052                 mutex_unlock(&vcpu->mutex);
1053         }
1054
1055         /*
1056          * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
1057          * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
1058          * against xive code getting called during vcpu execution or
1059          * set/get one_reg operations.
1060          */
1061         kvm->arch.xive = NULL;
1062
1063         for (i = 0; i <= xive->max_sbid; i++) {
1064                 if (xive->src_blocks[i])
1065                         kvmppc_xive_free_sources(xive->src_blocks[i]);
1066                 kfree(xive->src_blocks[i]);
1067                 xive->src_blocks[i] = NULL;
1068         }
1069
1070         if (xive->vp_base != XIVE_INVALID_VP)
1071                 xive_native_free_vp_block(xive->vp_base);
1072
1073         /*
1074          * A reference to the kvmppc_xive pointer is now kept under
1075          * the xive_devices struct of the machine for reuse. For
1076          * now, it is only freed when the VM is destroyed, until
1077          * all the execution paths are fixed.
1078          */
1079
1080         kfree(dev);
1081 }
1082
1083 /*
1084  * Create a XIVE device.  kvm->lock is held.
1085  */
1086 static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
1087 {
1088         struct kvmppc_xive *xive;
1089         struct kvm *kvm = dev->kvm;
1090
1091         pr_devel("Creating xive native device\n");
1092
1093         if (kvm->arch.xive)
1094                 return -EEXIST;
1095
1096         xive = kvmppc_xive_get_device(kvm, type);
1097         if (!xive)
1098                 return -ENOMEM;
1099
1100         dev->private = xive;
1101         xive->dev = dev;
1102         xive->kvm = kvm;
1103         mutex_init(&xive->mapping_lock);
1104         mutex_init(&xive->lock);
1105
1106         /* VP allocation is delayed to the first call to connect_vcpu */
1107         xive->vp_base = XIVE_INVALID_VP;
1108         /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
1109          * on a POWER9 system.
1110          */
1111         xive->nr_servers = KVM_MAX_VCPUS;
1112
1113         xive->single_escalation = xive_native_has_single_escalation();
1114         xive->ops = &kvmppc_xive_native_ops;
1115
1116         kvm->arch.xive = xive;
1117         return 0;
1118 }
1119
1120 /*
1121  * Interrupt Pending Buffer (IPB) offset
1122  */
1123 #define TM_IPB_SHIFT 40
1124 #define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
1125
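/*
 * Used by the one_reg get/set interface (KVM_REG_PPC_VP_STATE) to
 * migrate the vCPU interrupt presenter state: the saved thread context
 * word 0-1 merged with the IPB backed up in the NVT by the XIVE IC.
 */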
1126 int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1127 {
1128         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1129         u64 opal_state;
1130         int rc;
1131
1132         if (!kvmppc_xive_enabled(vcpu))
1133                 return -EPERM;
1134
1135         if (!xc)
1136                 return -ENOENT;
1137
1138         /* Thread context registers. We only care about IPB and CPPR */
1139         val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
1140
1141         /* Get the VP state from OPAL */
1142         rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
1143         if (rc)
1144                 return rc;
1145
1146         /*
1147          * Capture the backup of the IPB register from the NVT
1148          * structure and merge it into our KVM VP state.
1149          */
1150         val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
1151
1152         pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
1153                  __func__,
1154                  vcpu->arch.xive_saved_state.nsr,
1155                  vcpu->arch.xive_saved_state.cppr,
1156                  vcpu->arch.xive_saved_state.ipb,
1157                  vcpu->arch.xive_saved_state.pipr,
1158                  vcpu->arch.xive_saved_state.w01,
1159                  (u32) vcpu->arch.xive_cam_word, opal_state);
1160
1161         return 0;
1162 }
1163
1164 int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1165 {
1166         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1167         struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1168
1169         pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
1170                  val->xive_timaval[0], val->xive_timaval[1]);
1171
1172         if (!kvmppc_xive_enabled(vcpu))
1173                 return -EPERM;
1174
1175         if (!xc || !xive)
1176                 return -ENOENT;
1177
1178         /* We can't update the state of a "pushed" VCPU  */
1179         if (WARN_ON(vcpu->arch.xive_pushed))
1180                 return -EBUSY;
1181
1182         /*
1183          * Restore the thread context registers. IPB and CPPR should
1184          * be the only ones that matter.
1185          */
1186         vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
1187
1188         /*
1189          * There is no need to restore the XIVE internal state (IPB
1190          * stored in the NVT) as the IPB register was merged in KVM VP
1191          * state when captured.
1192          */
1193         return 0;
1194 }
1195
1196 bool kvmppc_xive_native_supported(void)
1197 {
1198         return xive_native_has_queue_state_support();
1199 }
1200
1201 static int xive_native_debug_show(struct seq_file *m, void *private)
1202 {
1203         struct kvmppc_xive *xive = m->private;
1204         struct kvm *kvm = xive->kvm;
1205         struct kvm_vcpu *vcpu;
1206         unsigned int i;
1207
1208         if (!kvm)
1209                 return 0;
1210
1211         seq_puts(m, "=========\nVCPU state\n=========\n");
1212
1213         kvm_for_each_vcpu(i, vcpu, kvm) {
1214                 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1215
1216                 if (!xc)
1217                         continue;
1218
1219                 seq_printf(m, "VCPU %d: VP=%#x/%02x\n"
1220                            "    NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
1221                            xc->server_num, xc->vp_id, xc->vp_chip_id,
1222                            vcpu->arch.xive_saved_state.nsr,
1223                            vcpu->arch.xive_saved_state.cppr,
1224                            vcpu->arch.xive_saved_state.ipb,
1225                            vcpu->arch.xive_saved_state.pipr,
1226                            be64_to_cpu(vcpu->arch.xive_saved_state.w01),
1227                            be32_to_cpu(vcpu->arch.xive_cam_word));
1228
1229                 kvmppc_xive_debug_show_queues(m, vcpu);
1230         }
1231
1232         seq_puts(m, "=========\nSources\n=========\n");
1233
1234         for (i = 0; i <= xive->max_sbid; i++) {
1235                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
1236
1237                 if (sb) {
1238                         arch_spin_lock(&sb->lock);
1239                         kvmppc_xive_debug_show_sources(m, sb);
1240                         arch_spin_unlock(&sb->lock);
1241                 }
1242         }
1243
1244         return 0;
1245 }
1246
1247 DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
1248
1249 static void xive_native_debugfs_init(struct kvmppc_xive *xive)
1250 {
1251         char *name;
1252
1253         name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
1254         if (!name) {
1255                 pr_err("%s: no memory for name\n", __func__);
1256                 return;
1257         }
1258
1259         xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
1260                                            xive, &xive_native_debug_fops);
1261
1262         pr_debug("%s: created %s\n", __func__, name);
1263         kfree(name);
1264 }
1265
1266 static void kvmppc_xive_native_init(struct kvm_device *dev)
1267 {
1268         struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
1269
1270         /* Register some debug interfaces */
1271         xive_native_debugfs_init(xive);
1272 }
1273
1274 struct kvm_device_ops kvm_xive_native_ops = {
1275         .name = "kvm-xive-native",
1276         .create = kvmppc_xive_native_create,
1277         .init = kvmppc_xive_native_init,
1278         .release = kvmppc_xive_native_release,
1279         .set_attr = kvmppc_xive_native_set_attr,
1280         .get_attr = kvmppc_xive_native_get_attr,
1281         .has_attr = kvmppc_xive_native_has_attr,
1282         .mmap = kvmppc_xive_native_mmap,
1283 };