arch/powerpc/kvm/book3s_xive_native.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2019, IBM Corporation.
4  */
5
6 #define pr_fmt(fmt) "xive-kvm: " fmt
7
8 #include <linux/kernel.h>
9 #include <linux/kvm_host.h>
10 #include <linux/err.h>
11 #include <linux/gfp.h>
12 #include <linux/spinlock.h>
13 #include <linux/delay.h>
14 #include <linux/file.h>
15 #include <linux/irqdomain.h>
16 #include <linux/uaccess.h>
17 #include <asm/kvm_book3s.h>
18 #include <asm/kvm_ppc.h>
19 #include <asm/hvcall.h>
20 #include <asm/xive.h>
21 #include <asm/xive-regs.h>
22 #include <asm/debug.h>
23 #include <asm/debugfs.h>
24 #include <asm/opal.h>
25
26 #include <linux/debugfs.h>
27 #include <linux/seq_file.h>
28
29 #include "book3s_xive.h"
30
31 static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
32 {
33         u64 val;
34
35         /*
36          * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
37          * load operation, so there is no need to enforce load-after-store
38          * ordering.
39          */
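        /*
         * offset is one of the XIVE_ESB_* special load offsets within
         * the ESB management page (e.g. XIVE_ESB_SET_PQ_01 to mask a
         * source); the ESB returns the PQ bits of the source in the
         * low byte of the load result.
         */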
40
41         val = in_be64(xd->eoi_mmio + offset);
42         return (u8)val;
43 }
44
45 static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
46 {
47         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
48         struct xive_q *q = &xc->queues[prio];
49
50         xive_native_disable_queue(xc->vp_id, q, prio);
51         if (q->qpage) {
52                 put_page(virt_to_page(q->qpage));
53                 q->qpage = NULL;
54         }
55 }
56
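/*
 * A thin wrapper around xive_native_configure_queue() which, on success,
 * drops the reference held on the previous queue page (taken by
 * gfn_to_page() in kvmppc_xive_native_set_queue_config()), if any.
 */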
57 static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
58                                               u8 prio, __be32 *qpage,
59                                               u32 order, bool can_escalate)
60 {
61         int rc;
62         __be32 *qpage_prev = q->qpage;
63
64         rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
65                                          can_escalate);
66         if (rc)
67                 return rc;
68
69         if (qpage_prev)
70                 put_page(virt_to_page(qpage_prev));
71
72         return rc;
73 }
74
75 void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
76 {
77         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
78         int i;
79
80         if (!kvmppc_xive_enabled(vcpu))
81                 return;
82
83         if (!xc)
84                 return;
85
86         pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
87
88         /* Ensure no interrupt is still routed to that VP */
89         xc->valid = false;
90         kvmppc_xive_disable_vcpu_interrupts(vcpu);
91
92         /* Free escalations */
93         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
94                 /* Free the escalation irq */
95                 if (xc->esc_virq[i]) {
96                         if (xc->xive->single_escalation)
97                                 xive_cleanup_single_escalation(vcpu, xc,
98                                                         xc->esc_virq[i]);
99                         free_irq(xc->esc_virq[i], vcpu);
100                         irq_dispose_mapping(xc->esc_virq[i]);
101                         kfree(xc->esc_virq_names[i]);
102                         xc->esc_virq[i] = 0;
103                 }
104         }
105
106         /* Disable the VP */
107         xive_native_disable_vp(xc->vp_id);
108
109         /* Clear the cam word so guest entry won't try to push context */
110         vcpu->arch.xive_cam_word = 0;
111
112         /* Free the queues */
113         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
114                 kvmppc_xive_native_cleanup_queue(vcpu, i);
115         }
116
117         /* Free the VP */
118         kfree(xc);
119
120         /* Cleanup the vcpu */
121         vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
122         vcpu->arch.xive_vcpu = NULL;
123 }
124
125 int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
126                                     struct kvm_vcpu *vcpu, u32 server_num)
127 {
128         struct kvmppc_xive *xive = dev->private;
129         struct kvmppc_xive_vcpu *xc = NULL;
130         int rc;
131         u32 vp_id;
132
133         pr_devel("native_connect_vcpu(server=%d)\n", server_num);
134
135         if (dev->ops != &kvm_xive_native_ops) {
136                 pr_devel("Wrong ops !\n");
137                 return -EPERM;
138         }
139         if (xive->kvm != vcpu->kvm)
140                 return -EPERM;
141         if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
142                 return -EBUSY;
143
144         mutex_lock(&xive->lock);
145
146         rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
147         if (rc)
148                 goto bail;
149
150         xc = kzalloc(sizeof(*xc), GFP_KERNEL);
151         if (!xc) {
152                 rc = -ENOMEM;
153                 goto bail;
154         }
155
156         vcpu->arch.xive_vcpu = xc;
157         xc->xive = xive;
158         xc->vcpu = vcpu;
159         xc->server_num = server_num;
160
161         xc->vp_id = vp_id;
162         xc->valid = true;
163         vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
164
165         rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
166         if (rc) {
167                 pr_err("Failed to get VP info from OPAL: %d\n", rc);
168                 goto bail;
169         }
170
171         /*
172          * Enable the VP first, as the single escalation mode will
173          * affect the numbering of the escalation interrupts.
174          */
175         rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
176         if (rc) {
177                 pr_err("Failed to enable VP in OPAL: %d\n", rc);
178                 goto bail;
179         }
180
181         /* Configure VCPU fields for use by assembly push/pull */
182         vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
183         vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
184
185         /* TODO: reset all queues to a clean state ? */
186 bail:
187         mutex_unlock(&xive->lock);
188         if (rc)
189                 kvmppc_xive_native_cleanup_vcpu(vcpu);
190
191         return rc;
192 }
193
194 /*
195  * Device passthrough support
196  */
197 static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
198 {
199         struct kvmppc_xive *xive = kvm->arch.xive;
200         pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;
201
202         if (irq >= KVMPPC_XIVE_NR_IRQS)
203                 return -EINVAL;
204
205         /*
206          * Clear the ESB pages of the IRQ number being mapped (or
207          * unmapped) into the guest and let the VM fault handler
208          * repopulate with the appropriate ESB pages (device or IC).
209          */
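        /*
         * Each guest IRQ is backed by two ESB pages (trigger + EOI),
         * hence the "irq * 2" page offset and the two-page length below.
         */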
210         pr_debug("clearing esb pages for girq 0x%lx\n", irq);
211         mutex_lock(&xive->mapping_lock);
212         if (xive->mapping)
213                 unmap_mapping_range(xive->mapping,
214                                     esb_pgoff << PAGE_SHIFT,
215                                     2ull << PAGE_SHIFT, 1);
216         mutex_unlock(&xive->mapping_lock);
217         return 0;
218 }
219
220 static struct kvmppc_xive_ops kvmppc_xive_native_ops =  {
221         .reset_mapped = kvmppc_xive_native_reset_mapped,
222 };
223
224 static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
225 {
226         struct vm_area_struct *vma = vmf->vma;
227         struct kvm_device *dev = vma->vm_file->private_data;
228         struct kvmppc_xive *xive = dev->private;
229         struct kvmppc_xive_src_block *sb;
230         struct kvmppc_xive_irq_state *state;
231         struct xive_irq_data *xd;
232         u32 hw_num;
233         u16 src;
234         u64 page;
235         unsigned long irq;
236         u64 page_offset;
237
238         /*
239          * Linux/KVM uses a two-page ESB setting, one page for trigger
240          * and one for EOI.
241          */
242         page_offset = vmf->pgoff - vma->vm_pgoff;
243         irq = page_offset / 2;
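        /* e.g. guest IRQ N is backed by pages 2*N (trigger) and 2*N + 1 (EOI) */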
244
245         sb = kvmppc_xive_find_source(xive, irq, &src);
246         if (!sb) {
247                 pr_devel("%s: source %lx not found !\n", __func__, irq);
248                 return VM_FAULT_SIGBUS;
249         }
250
251         state = &sb->irq_state[src];
252
253         /* Some sanity checking */
254         if (!state->valid) {
255                 pr_devel("%s: source %lx invalid !\n", __func__, irq);
256                 return VM_FAULT_SIGBUS;
257         }
258
259         kvmppc_xive_select_irq(state, &hw_num, &xd);
260
261         arch_spin_lock(&sb->lock);
262
263         /*
264          * first/even page is for trigger
265          * second/odd page is for EOI and management.
266          */
267         page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
268         arch_spin_unlock(&sb->lock);
269
270         if (WARN_ON(!page)) {
271                 pr_err("%s: accessing invalid ESB page for source %lx !\n",
272                        __func__, irq);
273                 return VM_FAULT_SIGBUS;
274         }
275
276         vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
277         return VM_FAULT_NOPAGE;
278 }
279
280 static const struct vm_operations_struct xive_native_esb_vmops = {
281         .fault = xive_native_esb_fault,
282 };
283
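/*
 * The TIMA is exposed to userspace as four contiguous pages, one per
 * view: HW (0), HV pool (1), OS (2) and User (3). Only the OS view is
 * let through here.
 */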
284 static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
285 {
286         struct vm_area_struct *vma = vmf->vma;
287
288         switch (vmf->pgoff - vma->vm_pgoff) {
289         case 0: /* HW - forbid access */
290         case 1: /* HV - forbid access */
291                 return VM_FAULT_SIGBUS;
292         case 2: /* OS */
293                 vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
294                 return VM_FAULT_NOPAGE;
295         case 3: /* USER - TODO */
296         default:
297                 return VM_FAULT_SIGBUS;
298         }
299 }
300
301 static const struct vm_operations_struct xive_native_tima_vmops = {
302         .fault = xive_native_tima_fault,
303 };
304
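/*
 * A minimal userspace sketch (hypothetical variable names), assuming the
 * usual KVM device mmap path:
 *
 *   tima = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *               xive_fd, KVM_XIVE_TIMA_PAGE_OFFSET * page_size);
 *   esbs = mmap(NULL, nr_irqs * 2 * page_size, PROT_READ | PROT_WRITE,
 *               MAP_SHARED, xive_fd, KVM_XIVE_ESB_PAGE_OFFSET * page_size);
 */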
305 static int kvmppc_xive_native_mmap(struct kvm_device *dev,
306                                    struct vm_area_struct *vma)
307 {
308         struct kvmppc_xive *xive = dev->private;
309
310         /* We only allow mappings at fixed offset for now */
311         if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
312                 if (vma_pages(vma) > 4)
313                         return -EINVAL;
314                 vma->vm_ops = &xive_native_tima_vmops;
315         } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
316                 if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
317                         return -EINVAL;
318                 vma->vm_ops = &xive_native_esb_vmops;
319         } else {
320                 return -EINVAL;
321         }
322
323         vma->vm_flags |= VM_IO | VM_PFNMAP;
324         vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
325
326         /*
327          * Grab the KVM device file address_space to be able to clear
328          * the ESB pages mapping when a device is passed-through into
329          * the guest.
330          */
331         xive->mapping = vma->vm_file->f_mapping;
332         return 0;
333 }
334
335 static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
336                                          u64 addr)
337 {
338         struct kvmppc_xive_src_block *sb;
339         struct kvmppc_xive_irq_state *state;
340         u64 __user *ubufp = (u64 __user *) addr;
341         u64 val;
342         u16 idx;
343         int rc;
344
345         pr_devel("%s irq=0x%lx\n", __func__, irq);
346
347         if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
348                 return -E2BIG;
349
350         sb = kvmppc_xive_find_source(xive, irq, &idx);
351         if (!sb) {
352                 pr_debug("No source, creating source block...\n");
353                 sb = kvmppc_xive_create_src_block(xive, irq);
354                 if (!sb) {
355                         pr_err("Failed to create block...\n");
356                         return -ENOMEM;
357                 }
358         }
359         state = &sb->irq_state[idx];
360
361         if (get_user(val, ubufp)) {
362                 pr_err("fault getting user info !\n");
363                 return -EFAULT;
364         }
365
366         arch_spin_lock(&sb->lock);
367
368         /*
369          * If the source doesn't already have an IPI, allocate
370          * one and get the corresponding data
371          */
372         if (!state->ipi_number) {
373                 state->ipi_number = xive_native_alloc_irq();
374                 if (state->ipi_number == 0) {
375                         pr_err("Failed to allocate IRQ !\n");
376                         rc = -ENXIO;
377                         goto unlock;
378                 }
379                 xive_native_populate_irq_data(state->ipi_number,
380                                               &state->ipi_data);
381                 pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
382                          state->ipi_number, irq);
383         }
384
385         /* Restore LSI state */
386         if (val & KVM_XIVE_LEVEL_SENSITIVE) {
387                 state->lsi = true;
388                 if (val & KVM_XIVE_LEVEL_ASSERTED)
389                         state->asserted = true;
390                 pr_devel("  LSI ! Asserted=%d\n", state->asserted);
391         }
392
393         /* Mask IRQ to start with */
394         state->act_server = 0;
395         state->act_priority = MASKED;
396         xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
397         xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
398
399         /* Increment the number of valid sources and mark this one valid */
400         if (!state->valid)
401                 xive->src_count++;
402         state->valid = true;
403
404         rc = 0;
405
406 unlock:
407         arch_spin_unlock(&sb->lock);
408
409         return rc;
410 }
411
412 static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
413                                         struct kvmppc_xive_src_block *sb,
414                                         struct kvmppc_xive_irq_state *state,
415                                         u32 server, u8 priority, bool masked,
416                                         u32 eisn)
417 {
418         struct kvm *kvm = xive->kvm;
419         u32 hw_num;
420         int rc = 0;
421
422         arch_spin_lock(&sb->lock);
423
424         if (state->act_server == server && state->act_priority == priority &&
425             state->eisn == eisn)
426                 goto unlock;
427
428         pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
429                  priority, server, masked, state->act_server,
430                  state->act_priority);
431
432         kvmppc_xive_select_irq(state, &hw_num, NULL);
433
434         if (priority != MASKED && !masked) {
435                 rc = kvmppc_xive_select_target(kvm, &server, priority);
436                 if (rc)
437                         goto unlock;
438
439                 state->act_priority = priority;
440                 state->act_server = server;
441                 state->eisn = eisn;
442
443                 rc = xive_native_configure_irq(hw_num,
444                                                kvmppc_xive_vp(xive, server),
445                                                priority, eisn);
446         } else {
447                 state->act_priority = MASKED;
448                 state->act_server = 0;
449                 state->eisn = 0;
450
451                 rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
452         }
453
454 unlock:
455         arch_spin_unlock(&sb->lock);
456         return rc;
457 }
458
459 static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
460                                                 long irq, u64 addr)
461 {
462         struct kvmppc_xive_src_block *sb;
463         struct kvmppc_xive_irq_state *state;
464         u64 __user *ubufp = (u64 __user *) addr;
465         u16 src;
466         u64 kvm_cfg;
467         u32 server;
468         u8 priority;
469         bool masked;
470         u32 eisn;
471
472         sb = kvmppc_xive_find_source(xive, irq, &src);
473         if (!sb)
474                 return -ENOENT;
475
476         state = &sb->irq_state[src];
477
478         if (!state->valid)
479                 return -EINVAL;
480
481         if (get_user(kvm_cfg, ubufp))
482                 return -EFAULT;
483
484         pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
485
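        /*
         * Illustration only: a non-masked source targeting 'server' at
         * priority 'prio' with EISN 'eisn' is encoded by userspace as
         *   (prio << KVM_XIVE_SOURCE_PRIORITY_SHIFT) |
         *   ((u64) server << KVM_XIVE_SOURCE_SERVER_SHIFT) |
         *   ((u64) eisn << KVM_XIVE_SOURCE_EISN_SHIFT)
         */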
486         priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
487                 KVM_XIVE_SOURCE_PRIORITY_SHIFT;
488         server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
489                 KVM_XIVE_SOURCE_SERVER_SHIFT;
490         masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
491                 KVM_XIVE_SOURCE_MASKED_SHIFT;
492         eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
493                 KVM_XIVE_SOURCE_EISN_SHIFT;
494
495         if (priority != xive_prio_from_guest(priority)) {
496                 pr_err("invalid priority for queue %d for VCPU %d\n",
497                        priority, server);
498                 return -EINVAL;
499         }
500
501         return kvmppc_xive_native_update_source_config(xive, sb, state, server,
502                                                        priority, masked, eisn);
503 }
504
505 static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
506                                           long irq, u64 addr)
507 {
508         struct kvmppc_xive_src_block *sb;
509         struct kvmppc_xive_irq_state *state;
510         struct xive_irq_data *xd;
511         u32 hw_num;
512         u16 src;
513         int rc = 0;
514
515         pr_devel("%s irq=0x%lx", __func__, irq);
516
517         sb = kvmppc_xive_find_source(xive, irq, &src);
518         if (!sb)
519                 return -ENOENT;
520
521         state = &sb->irq_state[src];
522
523         rc = -EINVAL;
524
525         arch_spin_lock(&sb->lock);
526
527         if (state->valid) {
528                 kvmppc_xive_select_irq(state, &hw_num, &xd);
529                 xive_native_sync_source(hw_num);
530                 rc = 0;
531         }
532
533         arch_spin_unlock(&sb->lock);
534         return rc;
535 }
536
537 static int xive_native_validate_queue_size(u32 qshift)
538 {
539         /*
540          * We only support 64K pages for the moment. This is also
541          * advertised in the DT property "ibm,xive-eq-sizes"
542          */
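        /* For instance, a 64kB EQ buffer corresponds to qshift = 16 */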
543         switch (qshift) {
544         case 0: /* EQ reset */
545         case 16:
546                 return 0;
547         case 12:
548         case 21:
549         case 24:
550         default:
551                 return -EINVAL;
552         }
553 }
554
555 static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
556                                                long eq_idx, u64 addr)
557 {
558         struct kvm *kvm = xive->kvm;
559         struct kvm_vcpu *vcpu;
560         struct kvmppc_xive_vcpu *xc;
561         void __user *ubufp = (void __user *) addr;
562         u32 server;
563         u8 priority;
564         struct kvm_ppc_xive_eq kvm_eq;
565         int rc;
566         __be32 *qaddr = NULL;
567         struct page *page;
568         struct xive_q *q;
569         gfn_t gfn;
570         unsigned long page_size;
571         int srcu_idx;
572
573         /*
574          * Demangle priority/server tuple from the EQ identifier
575          */
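        /*
         * Illustration only: userspace selects the EQ of priority 'prio'
         * on 'server' with an attribute index of
         *   (server << KVM_XIVE_EQ_SERVER_SHIFT) |
         *   (prio << KVM_XIVE_EQ_PRIORITY_SHIFT)
         */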
576         priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
577                 KVM_XIVE_EQ_PRIORITY_SHIFT;
578         server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
579                 KVM_XIVE_EQ_SERVER_SHIFT;
580
581         if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
582                 return -EFAULT;
583
584         vcpu = kvmppc_xive_find_server(kvm, server);
585         if (!vcpu) {
586                 pr_err("Can't find server %d\n", server);
587                 return -ENOENT;
588         }
589         xc = vcpu->arch.xive_vcpu;
590
591         if (priority != xive_prio_from_guest(priority)) {
592                 pr_err("Trying to restore invalid queue %d for VCPU %d\n",
593                        priority, server);
594                 return -EINVAL;
595         }
596         q = &xc->queues[priority];
597
598         pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
599                  __func__, server, priority, kvm_eq.flags,
600                  kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
601
602         /* reset queue and disable queueing */
603         if (!kvm_eq.qshift) {
604                 q->guest_qaddr  = 0;
605                 q->guest_qshift = 0;
606
607                 rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
608                                                         NULL, 0, true);
609                 if (rc) {
610                         pr_err("Failed to reset queue %d for VCPU %d: %d\n",
611                                priority, xc->server_num, rc);
612                         return rc;
613                 }
614
615                 return 0;
616         }
617
618         /*
619          * sPAPR specifies an "Unconditional Notify (n)" flag for the
620          * H_INT_SET_QUEUE_CONFIG hcall which forces notification
621          * without using the coalescing mechanisms provided by the
622          * XIVE END ESBs. This is required on KVM as notification
623          * using the END ESBs is not supported.
624          */
625         if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
626                 pr_err("invalid flags %d\n", kvm_eq.flags);
627                 return -EINVAL;
628         }
629
630         rc = xive_native_validate_queue_size(kvm_eq.qshift);
631         if (rc) {
632                 pr_err("invalid queue size %d\n", kvm_eq.qshift);
633                 return rc;
634         }
635
636         if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
637                 pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
638                        1ull << kvm_eq.qshift);
639                 return -EINVAL;
640         }
641
642         srcu_idx = srcu_read_lock(&kvm->srcu);
643         gfn = gpa_to_gfn(kvm_eq.qaddr);
644
645         page_size = kvm_host_page_size(vcpu, gfn);
646         if (1ull << kvm_eq.qshift > page_size) {
647                 srcu_read_unlock(&kvm->srcu, srcu_idx);
648                 pr_warn("Incompatible host page size %lx!\n", page_size);
649                 return -EINVAL;
650         }
651
652         page = gfn_to_page(kvm, gfn);
653         if (is_error_page(page)) {
654                 srcu_read_unlock(&kvm->srcu, srcu_idx);
655                 pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
656                 return -EINVAL;
657         }
658
659         qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
660         srcu_read_unlock(&kvm->srcu, srcu_idx);
661
662         /*
663          * Back up the queue page guest address so the EQ page can be
664          * marked dirty for migration.
665          */
666         q->guest_qaddr  = kvm_eq.qaddr;
667         q->guest_qshift = kvm_eq.qshift;
668
669          /*
670           * Unconditional Notification is forced by default at the
671           * OPAL level because the use of END ESBs is not supported by
672           * Linux.
673           */
674         rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
675                                         (__be32 *) qaddr, kvm_eq.qshift, true);
676         if (rc) {
677                 pr_err("Failed to configure queue %d for VCPU %d: %d\n",
678                        priority, xc->server_num, rc);
679                 put_page(page);
680                 return rc;
681         }
682
683         /*
684          * Only restore the queue state when needed. When coming from
685          * the H_INT_SET_QUEUE_CONFIG hcall, it should not be restored.
686          */
687         if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
688                 rc = xive_native_set_queue_state(xc->vp_id, priority,
689                                                  kvm_eq.qtoggle,
690                                                  kvm_eq.qindex);
691                 if (rc)
692                         goto error;
693         }
694
695         rc = kvmppc_xive_attach_escalation(vcpu, priority,
696                                            xive->single_escalation);
697 error:
698         if (rc)
699                 kvmppc_xive_native_cleanup_queue(vcpu, priority);
700         return rc;
701 }
702
703 static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
704                                                long eq_idx, u64 addr)
705 {
706         struct kvm *kvm = xive->kvm;
707         struct kvm_vcpu *vcpu;
708         struct kvmppc_xive_vcpu *xc;
709         struct xive_q *q;
710         void __user *ubufp = (void __user *) addr;
711         u32 server;
712         u8 priority;
713         struct kvm_ppc_xive_eq kvm_eq;
714         u64 qaddr;
715         u64 qshift;
716         u64 qeoi_page;
717         u32 escalate_irq;
718         u64 qflags;
719         int rc;
720
721         /*
722          * Demangle priority/server tuple from the EQ identifier
723          */
724         priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
725                 KVM_XIVE_EQ_PRIORITY_SHIFT;
726         server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
727                 KVM_XIVE_EQ_SERVER_SHIFT;
728
729         vcpu = kvmppc_xive_find_server(kvm, server);
730         if (!vcpu) {
731                 pr_err("Can't find server %d\n", server);
732                 return -ENOENT;
733         }
734         xc = vcpu->arch.xive_vcpu;
735
736         if (priority != xive_prio_from_guest(priority)) {
737                 pr_err("invalid priority for queue %d for VCPU %d\n",
738                        priority, server);
739                 return -EINVAL;
740         }
741         q = &xc->queues[priority];
742
743         memset(&kvm_eq, 0, sizeof(kvm_eq));
744
745         if (!q->qpage)
746                 return 0;
747
748         rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
749                                         &qeoi_page, &escalate_irq, &qflags);
750         if (rc)
751                 return rc;
752
753         kvm_eq.flags = 0;
754         if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
755                 kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
756
757         kvm_eq.qshift = q->guest_qshift;
758         kvm_eq.qaddr  = q->guest_qaddr;
759
760         rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
761                                          &kvm_eq.qindex);
762         if (rc)
763                 return rc;
764
765         pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
766                  __func__, server, priority, kvm_eq.flags,
767                  kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
768
769         if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
770                 return -EFAULT;
771
772         return 0;
773 }
774
775 static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
776 {
777         int i;
778
779         for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
780                 struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
781
782                 if (!state->valid)
783                         continue;
784
785                 if (state->act_priority == MASKED)
786                         continue;
787
788                 state->eisn = 0;
789                 state->act_server = 0;
790                 state->act_priority = MASKED;
791                 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
792                 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
793                 if (state->pt_number) {
794                         xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
795                         xive_native_configure_irq(state->pt_number,
796                                                   0, MASKED, 0);
797                 }
798         }
799 }
800
801 static int kvmppc_xive_reset(struct kvmppc_xive *xive)
802 {
803         struct kvm *kvm = xive->kvm;
804         struct kvm_vcpu *vcpu;
805         unsigned int i;
806
807         pr_devel("%s\n", __func__);
808
809         mutex_lock(&xive->lock);
810
811         kvm_for_each_vcpu(i, vcpu, kvm) {
812                 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
813                 unsigned int prio;
814
815                 if (!xc)
816                         continue;
817
818                 kvmppc_xive_disable_vcpu_interrupts(vcpu);
819
820                 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
821
822                         /* Single escalation, no queue 7 */
823                         if (prio == 7 && xive->single_escalation)
824                                 break;
825
826                         if (xc->esc_virq[prio]) {
827                                 free_irq(xc->esc_virq[prio], vcpu);
828                                 irq_dispose_mapping(xc->esc_virq[prio]);
829                                 kfree(xc->esc_virq_names[prio]);
830                                 xc->esc_virq[prio] = 0;
831                         }
832
833                         kvmppc_xive_native_cleanup_queue(vcpu, prio);
834                 }
835         }
836
837         for (i = 0; i <= xive->max_sbid; i++) {
838                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
839
840                 if (sb) {
841                         arch_spin_lock(&sb->lock);
842                         kvmppc_xive_reset_sources(sb);
843                         arch_spin_unlock(&sb->lock);
844                 }
845         }
846
847         mutex_unlock(&xive->lock);
848
849         return 0;
850 }
851
852 static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
853 {
854         int j;
855
856         for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
857                 struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
858                 struct xive_irq_data *xd;
859                 u32 hw_num;
860
861                 if (!state->valid)
862                         continue;
863
864                 /*
865                  * The struct kvmppc_xive_irq_state reflects the state
866                  * of the EAS configuration and not the state of the
867                  * source. The source is masked by setting the PQ bits to
868                  * '-Q', which is what is being done before calling
869                  * the KVM_DEV_XIVE_EQ_SYNC control.
870                  *
871                  * If a source EAS is configured, OPAL syncs the XIVE
872                  * IC of the source and the XIVE IC of the previous
873                  * target if any.
874                  *
875                  * So it should be fine ignoring MASKED sources as
876                  * they have been synced already.
877                  */
878                 if (state->act_priority == MASKED)
879                         continue;
880
881                 kvmppc_xive_select_irq(state, &hw_num, &xd);
882                 xive_native_sync_source(hw_num);
883                 xive_native_sync_queue(hw_num);
884         }
885 }
886
887 static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
888 {
889         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
890         unsigned int prio;
891         int srcu_idx;
892
893         if (!xc)
894                 return -ENOENT;
895
896         for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
897                 struct xive_q *q = &xc->queues[prio];
898
899                 if (!q->qpage)
900                         continue;
901
902                 /* Mark EQ page dirty for migration */
903                 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
904                 mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
905                 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
906         }
907         return 0;
908 }
909
910 static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
911 {
912         struct kvm *kvm = xive->kvm;
913         struct kvm_vcpu *vcpu;
914         unsigned int i;
915
916         pr_devel("%s\n", __func__);
917
918         mutex_lock(&xive->lock);
919         for (i = 0; i <= xive->max_sbid; i++) {
920                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
921
922                 if (sb) {
923                         arch_spin_lock(&sb->lock);
924                         kvmppc_xive_native_sync_sources(sb);
925                         arch_spin_unlock(&sb->lock);
926                 }
927         }
928
929         kvm_for_each_vcpu(i, vcpu, kvm) {
930                 kvmppc_xive_native_vcpu_eq_sync(vcpu);
931         }
932         mutex_unlock(&xive->lock);
933
934         return 0;
935 }
936
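/*
 * Illustrative userspace sketch (hypothetical fd name): attributes are
 * driven through the generic KVM device control API, e.g. a device reset
 * maps to
 *
 *   struct kvm_device_attr attr = {
 *           .group = KVM_DEV_XIVE_GRP_CTRL,
 *           .attr  = KVM_DEV_XIVE_RESET,
 *   };
 *   ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
 */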
937 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
938                                        struct kvm_device_attr *attr)
939 {
940         struct kvmppc_xive *xive = dev->private;
941
942         switch (attr->group) {
943         case KVM_DEV_XIVE_GRP_CTRL:
944                 switch (attr->attr) {
945                 case KVM_DEV_XIVE_RESET:
946                         return kvmppc_xive_reset(xive);
947                 case KVM_DEV_XIVE_EQ_SYNC:
948                         return kvmppc_xive_native_eq_sync(xive);
949                 case KVM_DEV_XIVE_NR_SERVERS:
950                         return kvmppc_xive_set_nr_servers(xive, attr->addr);
951                 }
952                 break;
953         case KVM_DEV_XIVE_GRP_SOURCE:
954                 return kvmppc_xive_native_set_source(xive, attr->attr,
955                                                      attr->addr);
956         case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
957                 return kvmppc_xive_native_set_source_config(xive, attr->attr,
958                                                             attr->addr);
959         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
960                 return kvmppc_xive_native_set_queue_config(xive, attr->attr,
961                                                            attr->addr);
962         case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
963                 return kvmppc_xive_native_sync_source(xive, attr->attr,
964                                                       attr->addr);
965         }
966         return -ENXIO;
967 }
968
969 static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
970                                        struct kvm_device_attr *attr)
971 {
972         struct kvmppc_xive *xive = dev->private;
973
974         switch (attr->group) {
975         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
976                 return kvmppc_xive_native_get_queue_config(xive, attr->attr,
977                                                            attr->addr);
978         }
979         return -ENXIO;
980 }
981
982 static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
983                                        struct kvm_device_attr *attr)
984 {
985         switch (attr->group) {
986         case KVM_DEV_XIVE_GRP_CTRL:
987                 switch (attr->attr) {
988                 case KVM_DEV_XIVE_RESET:
989                 case KVM_DEV_XIVE_EQ_SYNC:
990                 case KVM_DEV_XIVE_NR_SERVERS:
991                         return 0;
992                 }
993                 break;
994         case KVM_DEV_XIVE_GRP_SOURCE:
995         case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
996         case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
997                 if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
998                     attr->attr < KVMPPC_XIVE_NR_IRQS)
999                         return 0;
1000                 break;
1001         case KVM_DEV_XIVE_GRP_EQ_CONFIG:
1002                 return 0;
1003         }
1004         return -ENXIO;
1005 }
1006
1007 /*
1008  * Called when device fd is closed.  kvm->lock is held.
1009  */
1010 static void kvmppc_xive_native_release(struct kvm_device *dev)
1011 {
1012         struct kvmppc_xive *xive = dev->private;
1013         struct kvm *kvm = xive->kvm;
1014         struct kvm_vcpu *vcpu;
1015         int i;
1016
1017         pr_devel("Releasing xive native device\n");
1018
1019         /*
1020          * Clear the KVM device file address_space which is used to
1021          * unmap the ESB pages when a device is passed-through.
1022          */
1023         mutex_lock(&xive->mapping_lock);
1024         xive->mapping = NULL;
1025         mutex_unlock(&xive->mapping_lock);
1026
1027         /*
1028          * Since this is the device release function, we know that
1029          * userspace does not have any open fd or mmap referring to
1030          * the device.  Therefore there can not be any of the
1031          * device attribute set/get, mmap, or page fault functions
1032          * being executed concurrently, and similarly, the
1033          * connect_vcpu and set/clr_mapped functions also cannot
1034          * be in progress.
1035          */
1036
1037         debugfs_remove(xive->dentry);
1038
1039         /*
1040          * We should clean up the vCPU interrupt presenters first.
1041          */
1042         kvm_for_each_vcpu(i, vcpu, kvm) {
1043                 /*
1044                  * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1045                  * (i.e. kvmppc_xive_native_[gs]et_vp) can be in flight.
1046                  * Holding the vcpu->mutex also means that the vcpu cannot
1047                  * be executing the KVM_RUN ioctl, and therefore it cannot
1048                  * be executing the XIVE push or pull code or accessing
1049                  * the XIVE MMIO regions.
1050                  */
1051                 mutex_lock(&vcpu->mutex);
1052                 kvmppc_xive_native_cleanup_vcpu(vcpu);
1053                 mutex_unlock(&vcpu->mutex);
1054         }
1055
1056         /*
1057          * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
1058          * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
1059          * against xive code getting called during vcpu execution or
1060          * set/get one_reg operations.
1061          */
1062         kvm->arch.xive = NULL;
1063
1064         for (i = 0; i <= xive->max_sbid; i++) {
1065                 if (xive->src_blocks[i])
1066                         kvmppc_xive_free_sources(xive->src_blocks[i]);
1067                 kfree(xive->src_blocks[i]);
1068                 xive->src_blocks[i] = NULL;
1069         }
1070
1071         if (xive->vp_base != XIVE_INVALID_VP)
1072                 xive_native_free_vp_block(xive->vp_base);
1073
1074         /*
1075          * A reference to the kvmppc_xive pointer is now kept under
1076          * the xive_devices struct of the machine for reuse. For now,
1077          * it is only freed when the VM is destroyed, until all the
1078          * execution paths are fixed.
1079          */
1080
1081         kfree(dev);
1082 }
1083
1084 /*
1085  * Create a XIVE device.  kvm->lock is held.
1086  */
1087 static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
1088 {
1089         struct kvmppc_xive *xive;
1090         struct kvm *kvm = dev->kvm;
1091
1092         pr_devel("Creating xive native device\n");
1093
1094         if (kvm->arch.xive)
1095                 return -EEXIST;
1096
1097         xive = kvmppc_xive_get_device(kvm, type);
1098         if (!xive)
1099                 return -ENOMEM;
1100
1101         dev->private = xive;
1102         xive->dev = dev;
1103         xive->kvm = kvm;
1104         mutex_init(&xive->mapping_lock);
1105         mutex_init(&xive->lock);
1106
1107         /* VP allocation is delayed to the first call to connect_vcpu */
1108         xive->vp_base = XIVE_INVALID_VP;
1109         /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
1110          * on a POWER9 system.
1111          */
1112         xive->nr_servers = KVM_MAX_VCPUS;
1113
1114         xive->single_escalation = xive_native_has_single_escalation();
1115         xive->ops = &kvmppc_xive_native_ops;
1116
1117         kvm->arch.xive = xive;
1118         return 0;
1119 }
1120
1121 /*
1122  * Interrupt Pending Buffer (IPB) offset
1123  */
1124 #define TM_IPB_SHIFT 40
1125 #define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
1126
1127 int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1128 {
1129         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1130         u64 opal_state;
1131         int rc;
1132
1133         if (!kvmppc_xive_enabled(vcpu))
1134                 return -EPERM;
1135
1136         if (!xc)
1137                 return -ENOENT;
1138
1139         /* Thread context registers. We only care about IPB and CPPR */
1140         val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
1141
1142         /* Get the VP state from OPAL */
1143         rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
1144         if (rc)
1145                 return rc;
1146
1147         /*
1148          * Capture the backup of the IPB register in the NVT structure
1149          * and merge it into our KVM VP state.
1150          */
1151         val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
1152
1153         pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
1154                  __func__,
1155                  vcpu->arch.xive_saved_state.nsr,
1156                  vcpu->arch.xive_saved_state.cppr,
1157                  vcpu->arch.xive_saved_state.ipb,
1158                  vcpu->arch.xive_saved_state.pipr,
1159                  vcpu->arch.xive_saved_state.w01,
1160                  (u32) vcpu->arch.xive_cam_word, opal_state);
1161
1162         return 0;
1163 }
1164
1165 int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1166 {
1167         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1168         struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1169
1170         pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
1171                  val->xive_timaval[0], val->xive_timaval[1]);
1172
1173         if (!kvmppc_xive_enabled(vcpu))
1174                 return -EPERM;
1175
1176         if (!xc || !xive)
1177                 return -ENOENT;
1178
1179         /* We can't update the state of a "pushed" VCPU  */
1180         if (WARN_ON(vcpu->arch.xive_pushed))
1181                 return -EBUSY;
1182
1183         /*
1184          * Restore the thread context registers. IPB and CPPR should
1185          * be the only ones that matter.
1186          */
1187         vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
1188
1189         /*
1190          * There is no need to restore the XIVE internal state (IPB
1191          * stored in the NVT) as the IPB register was merged in KVM VP
1192          * state when captured.
1193          */
1194         return 0;
1195 }
1196
1197 bool kvmppc_xive_native_supported(void)
1198 {
1199         return xive_native_has_queue_state_support();
1200 }
1201
1202 static int xive_native_debug_show(struct seq_file *m, void *private)
1203 {
1204         struct kvmppc_xive *xive = m->private;
1205         struct kvm *kvm = xive->kvm;
1206         struct kvm_vcpu *vcpu;
1207         unsigned int i;
1208
1209         if (!kvm)
1210                 return 0;
1211
1212         seq_puts(m, "=========\nVCPU state\n=========\n");
1213
1214         kvm_for_each_vcpu(i, vcpu, kvm) {
1215                 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1216
1217                 if (!xc)
1218                         continue;
1219
1220                 seq_printf(m, "VCPU %d: VP=%#x/%02x\n"
1221                            "    NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
1222                            xc->server_num, xc->vp_id, xc->vp_chip_id,
1223                            vcpu->arch.xive_saved_state.nsr,
1224                            vcpu->arch.xive_saved_state.cppr,
1225                            vcpu->arch.xive_saved_state.ipb,
1226                            vcpu->arch.xive_saved_state.pipr,
1227                            be64_to_cpu(vcpu->arch.xive_saved_state.w01),
1228                            be32_to_cpu(vcpu->arch.xive_cam_word));
1229
1230                 kvmppc_xive_debug_show_queues(m, vcpu);
1231         }
1232
1233         seq_puts(m, "=========\nSources\n=========\n");
1234
1235         for (i = 0; i <= xive->max_sbid; i++) {
1236                 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
1237
1238                 if (sb) {
1239                         arch_spin_lock(&sb->lock);
1240                         kvmppc_xive_debug_show_sources(m, sb);
1241                         arch_spin_unlock(&sb->lock);
1242                 }
1243         }
1244
1245         return 0;
1246 }
1247
1248 DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
1249
1250 static void xive_native_debugfs_init(struct kvmppc_xive *xive)
1251 {
1252         char *name;
1253
1254         name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
1255         if (!name) {
1256                 pr_err("%s: no memory for name\n", __func__);
1257                 return;
1258         }
1259
1260         xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
1261                                            xive, &xive_native_debug_fops);
1262
1263         pr_debug("%s: created %s\n", __func__, name);
1264         kfree(name);
1265 }
1266
1267 static void kvmppc_xive_native_init(struct kvm_device *dev)
1268 {
1269         struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
1270
1271         /* Register some debug interfaces */
1272         xive_native_debugfs_init(xive);
1273 }
1274
1275 struct kvm_device_ops kvm_xive_native_ops = {
1276         .name = "kvm-xive-native",
1277         .create = kvmppc_xive_native_create,
1278         .init = kvmppc_xive_native_init,
1279         .release = kvmppc_xive_native_release,
1280         .set_attr = kvmppc_xive_native_set_attr,
1281         .get_attr = kvmppc_xive_native_get_attr,
1282         .has_attr = kvmppc_xive_native_has_attr,
1283         .mmap = kvmppc_xive_native_mmap,
1284 };