// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2019, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <asm/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/debugfs.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"

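/*
 * An ESB MMIO load returns the current PQ state bits of the interrupt
 * source. The load offset selects the ESB operation: a plain read of
 * the PQ bits or an atomic update of the PQ bits to a new value.
 */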
static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
{
        u64 val;

        /*
         * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
         * load operation, so there is no need to enforce load-after-store
         * ordering.
         */

        if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
                offset |= offset << 4;

        val = in_be64(xd->eoi_mmio + offset);
        return (u8)val;
}

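/*
 * Disable one of the vCPU's event queues in OPAL and release the
 * reference on the guest page backing it.
 */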
static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        struct xive_q *q = &xc->queues[prio];

        xive_native_disable_queue(xc->vp_id, q, prio);
        if (q->qpage) {
                put_page(virt_to_page(q->qpage));
                q->qpage = NULL;
        }
}

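/*
 * Configure the event queue in OPAL and, only once the new
 * configuration is in place, drop the reference on the page that
 * backed the previous queue, if any. Releasing the old page before
 * reconfiguration could let OPAL write to a page that has been freed.
 */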
static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
                                              u8 prio, __be32 *qpage,
                                              u32 order, bool can_escalate)
{
        int rc;
        __be32 *qpage_prev = q->qpage;

        rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
                                         can_escalate);
        if (rc)
                return rc;

        if (qpage_prev)
                put_page(virt_to_page(qpage_prev));

        return rc;
}

void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        int i;

        if (!kvmppc_xive_enabled(vcpu))
                return;

        if (!xc)
                return;

        pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);

        /* Ensure no interrupt is still routed to that VP */
        xc->valid = false;
        kvmppc_xive_disable_vcpu_interrupts(vcpu);

        /* Free escalations */
        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
                /* Free the escalation irq */
                if (xc->esc_virq[i]) {
                        if (xc->xive->single_escalation)
                                xive_cleanup_single_escalation(vcpu, xc,
                                                        xc->esc_virq[i]);
                        free_irq(xc->esc_virq[i], vcpu);
                        irq_dispose_mapping(xc->esc_virq[i]);
                        kfree(xc->esc_virq_names[i]);
                        xc->esc_virq[i] = 0;
                }
        }

        /* Disable the VP */
        xive_native_disable_vp(xc->vp_id);

        /* Clear the cam word so guest entry won't try to push context */
        vcpu->arch.xive_cam_word = 0;

        /* Free the queues */
        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
                kvmppc_xive_native_cleanup_queue(vcpu, i);
        }

        /* Free the VP */
        kfree(xc);

        /* Cleanup the vcpu */
        vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
        vcpu->arch.xive_vcpu = NULL;
}

int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
                                    struct kvm_vcpu *vcpu, u32 server_num)
{
        struct kvmppc_xive *xive = dev->private;
        struct kvmppc_xive_vcpu *xc = NULL;
        int rc;
        u32 vp_id;

        pr_devel("native_connect_vcpu(server=%d)\n", server_num);

        if (dev->ops != &kvm_xive_native_ops) {
                pr_devel("Wrong ops !\n");
                return -EPERM;
        }
        if (xive->kvm != vcpu->kvm)
                return -EPERM;
        if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
                return -EBUSY;

        mutex_lock(&xive->lock);

        rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
        if (rc)
                goto bail;

        xc = kzalloc(sizeof(*xc), GFP_KERNEL);
        if (!xc) {
                rc = -ENOMEM;
                goto bail;
        }

        vcpu->arch.xive_vcpu = xc;
        xc->xive = xive;
        xc->vcpu = vcpu;
        xc->server_num = server_num;

        xc->vp_id = vp_id;
        xc->valid = true;
        vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;

        rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
        if (rc) {
                pr_err("Failed to get VP info from OPAL: %d\n", rc);
                goto bail;
        }

        /*
         * Enable the VP first as the single escalation mode will
         * affect escalation interrupt numbering
         */
        rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
        if (rc) {
                pr_err("Failed to enable VP in OPAL: %d\n", rc);
                goto bail;
        }

        /* Configure VCPU fields for use by assembly push/pull */
        vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
        vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

        /* TODO: reset all queues to a clean state ? */
bail:
        mutex_unlock(&xive->lock);
        if (rc)
                kvmppc_xive_native_cleanup_vcpu(vcpu);

        return rc;
}

/*
 * Device passthrough support
 */
static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
{
        struct kvmppc_xive *xive = kvm->arch.xive;
        pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

        if (irq >= KVMPPC_XIVE_NR_IRQS)
                return -EINVAL;

        /*
         * Clear the ESB pages of the IRQ number being mapped (or
         * unmapped) into the guest and let the VM fault handler
         * repopulate with the appropriate ESB pages (device or IC)
         */
        pr_debug("clearing esb pages for girq 0x%lx\n", irq);
        mutex_lock(&xive->mapping_lock);
        if (xive->mapping)
                unmap_mapping_range(xive->mapping,
                                    esb_pgoff << PAGE_SHIFT,
                                    2ull << PAGE_SHIFT, 1);
        mutex_unlock(&xive->mapping_lock);
        return 0;
}

static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
        .reset_mapped = kvmppc_xive_native_reset_mapped,
};

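/*
 * Fault handler for the ESB region of the device mapping. Each guest
 * interrupt source owns two pages in that region: page 2 * irq is the
 * trigger page and page 2 * irq + 1 is the EOI/management page.
 */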
static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct kvm_device *dev = vma->vm_file->private_data;
        struct kvmppc_xive *xive = dev->private;
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        struct xive_irq_data *xd;
        u32 hw_num;
        u16 src;
        u64 page;
        unsigned long irq;
        u64 page_offset;

        /*
         * Linux/KVM uses a two-page ESB setting, one page for trigger
         * and one for EOI
         */
        page_offset = vmf->pgoff - vma->vm_pgoff;
        irq = page_offset / 2;

        sb = kvmppc_xive_find_source(xive, irq, &src);
        if (!sb) {
                pr_devel("%s: source %lx not found !\n", __func__, irq);
                return VM_FAULT_SIGBUS;
        }

        state = &sb->irq_state[src];
        kvmppc_xive_select_irq(state, &hw_num, &xd);

        arch_spin_lock(&sb->lock);

        /*
         * first/even page is for trigger
         * second/odd page is for EOI and management.
         */
        page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
        arch_spin_unlock(&sb->lock);

        if (WARN_ON(!page)) {
                pr_err("%s: accessing invalid ESB page for source %lx !\n",
                       __func__, irq);
                return VM_FAULT_SIGBUS;
        }

        vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
        return VM_FAULT_NOPAGE;
}

static const struct vm_operations_struct xive_native_esb_vmops = {
        .fault = xive_native_esb_fault,
};

static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        switch (vmf->pgoff - vma->vm_pgoff) {
        case 0: /* HW - forbid access */
        case 1: /* HV - forbid access */
                return VM_FAULT_SIGBUS;
        case 2: /* OS */
                vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
                return VM_FAULT_NOPAGE;
        case 3: /* USER - TODO */
        default:
                return VM_FAULT_SIGBUS;
        }
}

static const struct vm_operations_struct xive_native_tima_vmops = {
        .fault = xive_native_tima_fault,
};

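/*
 * The device mmap space has a fixed layout: the four TIMA pages are
 * mapped at KVM_XIVE_TIMA_PAGE_OFFSET and the per-source ESB pages
 * (two per interrupt) at KVM_XIVE_ESB_PAGE_OFFSET.
 */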
static int kvmppc_xive_native_mmap(struct kvm_device *dev,
                                   struct vm_area_struct *vma)
{
        struct kvmppc_xive *xive = dev->private;

        /* We only allow mappings at fixed offset for now */
        if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
                if (vma_pages(vma) > 4)
                        return -EINVAL;
                vma->vm_ops = &xive_native_tima_vmops;
        } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
                if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
                        return -EINVAL;
                vma->vm_ops = &xive_native_esb_vmops;
        } else {
                return -EINVAL;
        }

        vma->vm_flags |= VM_IO | VM_PFNMAP;
        vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);

        /*
         * Grab the KVM device file address_space to be able to clear
         * the ESB pages mapping when a device is passed-through into
         * the guest.
         */
        xive->mapping = vma->vm_file->f_mapping;
        return 0;
}

static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
                                         u64 addr)
{
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        u64 __user *ubufp = (u64 __user *) addr;
        u64 val;
        u16 idx;
        int rc;

        pr_devel("%s irq=0x%lx\n", __func__, irq);

        if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
                return -E2BIG;

        sb = kvmppc_xive_find_source(xive, irq, &idx);
        if (!sb) {
                pr_debug("No source, creating source block...\n");
                sb = kvmppc_xive_create_src_block(xive, irq);
                if (!sb) {
                        pr_err("Failed to create block...\n");
                        return -ENOMEM;
                }
        }
        state = &sb->irq_state[idx];

        if (get_user(val, ubufp)) {
                pr_err("fault getting user info !\n");
                return -EFAULT;
        }

        arch_spin_lock(&sb->lock);

        /*
         * If the source doesn't already have an IPI, allocate
         * one and get the corresponding data
         */
        if (!state->ipi_number) {
                state->ipi_number = xive_native_alloc_irq();
                if (state->ipi_number == 0) {
                        pr_err("Failed to allocate IRQ !\n");
                        rc = -ENXIO;
                        goto unlock;
                }
                xive_native_populate_irq_data(state->ipi_number,
                                              &state->ipi_data);
                pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
                         state->ipi_number, irq);
        }

        /* Restore LSI state */
        if (val & KVM_XIVE_LEVEL_SENSITIVE) {
                state->lsi = true;
                if (val & KVM_XIVE_LEVEL_ASSERTED)
                        state->asserted = true;
                pr_devel("  LSI ! Asserted=%d\n", state->asserted);
        }

        /* Mask IRQ to start with */
        state->act_server = 0;
        state->act_priority = MASKED;
        xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
        xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);

        /* Increment the number of valid sources and mark this one valid */
        if (!state->valid)
                xive->src_count++;
        state->valid = true;

        rc = 0;

unlock:
        arch_spin_unlock(&sb->lock);

        return rc;
}

static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
                                        struct kvmppc_xive_src_block *sb,
                                        struct kvmppc_xive_irq_state *state,
                                        u32 server, u8 priority, bool masked,
                                        u32 eisn)
{
        struct kvm *kvm = xive->kvm;
        u32 hw_num;
        int rc = 0;

        arch_spin_lock(&sb->lock);

        if (state->act_server == server && state->act_priority == priority &&
            state->eisn == eisn)
                goto unlock;

        pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
                 priority, server, masked, state->act_server,
                 state->act_priority);

        kvmppc_xive_select_irq(state, &hw_num, NULL);

        if (priority != MASKED && !masked) {
                rc = kvmppc_xive_select_target(kvm, &server, priority);
                if (rc)
                        goto unlock;

                state->act_priority = priority;
                state->act_server = server;
                state->eisn = eisn;

                rc = xive_native_configure_irq(hw_num,
                                               kvmppc_xive_vp(xive, server),
                                               priority, eisn);
        } else {
                state->act_priority = MASKED;
                state->act_server = 0;
                state->eisn = 0;

                rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
        }

unlock:
        arch_spin_unlock(&sb->lock);
        return rc;
}

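/*
 * The 64-bit configuration word passed by userspace packs the target
 * server, priority, masked state and EISN of the source, using the
 * KVM_XIVE_SOURCE_* masks and shifts from the KVM UAPI header.
 */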
static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
                                                long irq, u64 addr)
{
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        u64 __user *ubufp = (u64 __user *) addr;
        u16 src;
        u64 kvm_cfg;
        u32 server;
        u8 priority;
        bool masked;
        u32 eisn;

        sb = kvmppc_xive_find_source(xive, irq, &src);
        if (!sb)
                return -ENOENT;

        state = &sb->irq_state[src];

        if (!state->valid)
                return -EINVAL;

        if (get_user(kvm_cfg, ubufp))
                return -EFAULT;

        pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);

        priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
                KVM_XIVE_SOURCE_PRIORITY_SHIFT;
        server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
                KVM_XIVE_SOURCE_SERVER_SHIFT;
        masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
                KVM_XIVE_SOURCE_MASKED_SHIFT;
        eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
                KVM_XIVE_SOURCE_EISN_SHIFT;

        if (priority != xive_prio_from_guest(priority)) {
                pr_err("invalid priority for queue %d for VCPU %d\n",
                       priority, server);
                return -EINVAL;
        }

        return kvmppc_xive_native_update_source_config(xive, sb, state, server,
                                                       priority, masked, eisn);
}

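/*
 * Ask the XIVE IC, through OPAL, to flush any event in flight for the
 * source so that it is fully delivered to its event queue.
 */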
static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
                                          long irq, u64 addr)
{
        struct kvmppc_xive_src_block *sb;
        struct kvmppc_xive_irq_state *state;
        struct xive_irq_data *xd;
        u32 hw_num;
        u16 src;
        int rc = 0;

        pr_devel("%s irq=0x%lx", __func__, irq);

        sb = kvmppc_xive_find_source(xive, irq, &src);
        if (!sb)
                return -ENOENT;

        state = &sb->irq_state[src];

        rc = -EINVAL;

        arch_spin_lock(&sb->lock);

        if (state->valid) {
                kvmppc_xive_select_irq(state, &hw_num, &xd);
                xive_native_sync_source(hw_num);
                rc = 0;
        }

        arch_spin_unlock(&sb->lock);
        return rc;
}

static int xive_native_validate_queue_size(u32 qshift)
{
        /*
         * We only support 64K pages for the moment. This is also
         * advertised in the DT property "ibm,xive-eq-sizes"
         */
        switch (qshift) {
        case 0: /* EQ reset */
        case 16:
                return 0;
        case 12:
        case 21:
        case 24:
        default:
                return -EINVAL;
        }
}

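/*
 * Configure a vCPU event queue from the kvm_ppc_xive_eq description
 * provided by userspace: demangle the server/priority from the EQ
 * identifier, validate the queue size, translate the guest EQ page
 * address into a host page, hand the queue over to OPAL and, on
 * migration restore, reinstall the saved toggle bit and index.
 */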
static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
                                               long eq_idx, u64 addr)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        struct kvmppc_xive_vcpu *xc;
        void __user *ubufp = (void __user *) addr;
        u32 server;
        u8 priority;
        struct kvm_ppc_xive_eq kvm_eq;
        int rc;
        __be32 *qaddr = 0;
        struct page *page;
        struct xive_q *q;
        gfn_t gfn;
        unsigned long page_size;
        int srcu_idx;

        /*
         * Demangle priority/server tuple from the EQ identifier
         */
        priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
                KVM_XIVE_EQ_PRIORITY_SHIFT;
        server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
                KVM_XIVE_EQ_SERVER_SHIFT;

        if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
                return -EFAULT;

        vcpu = kvmppc_xive_find_server(kvm, server);
        if (!vcpu) {
                pr_err("Can't find server %d\n", server);
                return -ENOENT;
        }
        xc = vcpu->arch.xive_vcpu;

        if (priority != xive_prio_from_guest(priority)) {
                pr_err("Trying to restore invalid queue %d for VCPU %d\n",
                       priority, server);
                return -EINVAL;
        }
        q = &xc->queues[priority];

        pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
                 __func__, server, priority, kvm_eq.flags,
                 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

        /* reset queue and disable queueing */
        if (!kvm_eq.qshift) {
                q->guest_qaddr  = 0;
                q->guest_qshift = 0;

                rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
                                                        NULL, 0, true);
                if (rc) {
                        pr_err("Failed to reset queue %d for VCPU %d: %d\n",
                               priority, xc->server_num, rc);
                        return rc;
                }

                return 0;
        }

        /*
         * sPAPR specifies an "Unconditional Notify (n) flag" for the
         * H_INT_SET_QUEUE_CONFIG hcall which forces notification
         * without using the coalescing mechanisms provided by the
         * XIVE END ESBs. This is required on KVM as notification
         * using the END ESBs is not supported.
         */
        if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
                pr_err("invalid flags %d\n", kvm_eq.flags);
                return -EINVAL;
        }

        rc = xive_native_validate_queue_size(kvm_eq.qshift);
        if (rc) {
                pr_err("invalid queue size %d\n", kvm_eq.qshift);
                return rc;
        }

        if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
                pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
                       1ull << kvm_eq.qshift);
                return -EINVAL;
        }

        srcu_idx = srcu_read_lock(&kvm->srcu);
        gfn = gpa_to_gfn(kvm_eq.qaddr);

        page_size = kvm_host_page_size(vcpu, gfn);
        if (1ull << kvm_eq.qshift > page_size) {
                srcu_read_unlock(&kvm->srcu, srcu_idx);
                pr_warn("Incompatible host page size %lx!\n", page_size);
                return -EINVAL;
        }

        page = gfn_to_page(kvm, gfn);
        if (is_error_page(page)) {
                srcu_read_unlock(&kvm->srcu, srcu_idx);
                pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
                return -EINVAL;
        }

        qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        /*
         * Back up the queue page guest address so we can mark the EQ
         * page dirty for migration.
         */
        q->guest_qaddr  = kvm_eq.qaddr;
        q->guest_qshift = kvm_eq.qshift;

        /*
         * Unconditional Notification is forced by default at the
         * OPAL level because the use of END ESBs is not supported by
         * Linux.
         */
        rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
                                        (__be32 *) qaddr, kvm_eq.qshift, true);
        if (rc) {
                pr_err("Failed to configure queue %d for VCPU %d: %d\n",
                       priority, xc->server_num, rc);
                put_page(page);
                return rc;
        }

        /*
         * Only restore the queue state when needed. When doing the
         * H_INT_SET_QUEUE_CONFIG hcall, it should not.
         */
        if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
                rc = xive_native_set_queue_state(xc->vp_id, priority,
                                                 kvm_eq.qtoggle,
                                                 kvm_eq.qindex);
                if (rc)
                        goto error;
        }

        rc = kvmppc_xive_attach_escalation(vcpu, priority,
                                           xive->single_escalation);
error:
        if (rc)
                kvmppc_xive_native_cleanup_queue(vcpu, priority);
        return rc;
}

static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
                                               long eq_idx, u64 addr)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        struct kvmppc_xive_vcpu *xc;
        struct xive_q *q;
        void __user *ubufp = (u64 __user *) addr;
        u32 server;
        u8 priority;
        struct kvm_ppc_xive_eq kvm_eq;
        u64 qaddr;
        u64 qshift;
        u64 qeoi_page;
        u32 escalate_irq;
        u64 qflags;
        int rc;

        /*
         * Demangle priority/server tuple from the EQ identifier
         */
        priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
                KVM_XIVE_EQ_PRIORITY_SHIFT;
        server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
                KVM_XIVE_EQ_SERVER_SHIFT;

        vcpu = kvmppc_xive_find_server(kvm, server);
        if (!vcpu) {
                pr_err("Can't find server %d\n", server);
                return -ENOENT;
        }
        xc = vcpu->arch.xive_vcpu;

        if (priority != xive_prio_from_guest(priority)) {
                pr_err("invalid priority for queue %d for VCPU %d\n",
                       priority, server);
                return -EINVAL;
        }
        q = &xc->queues[priority];

        memset(&kvm_eq, 0, sizeof(kvm_eq));

        if (!q->qpage)
                return 0;

        rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
                                        &qeoi_page, &escalate_irq, &qflags);
        if (rc)
                return rc;

        kvm_eq.flags = 0;
        if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
                kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;

        kvm_eq.qshift = q->guest_qshift;
        kvm_eq.qaddr  = q->guest_qaddr;

        rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
                                         &kvm_eq.qindex);
        if (rc)
                return rc;

        pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
                 __func__, server, priority, kvm_eq.flags,
                 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

        if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
                return -EFAULT;

        return 0;
}

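/*
 * Mask every valid source of a source block and reset its EAS
 * configuration, for the internal IPI and, when the source is
 * passed-through, for the HW interrupt as well.
 */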
static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
{
        int i;

        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
                struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

                if (!state->valid)
                        continue;

                if (state->act_priority == MASKED)
                        continue;

                state->eisn = 0;
                state->act_server = 0;
                state->act_priority = MASKED;
                xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
                xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
                if (state->pt_number) {
                        xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
                        xive_native_configure_irq(state->pt_number,
                                                  0, MASKED, 0);
                }
        }
}

static int kvmppc_xive_reset(struct kvmppc_xive *xive)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        unsigned int i;

        pr_devel("%s\n", __func__);

        mutex_lock(&xive->lock);

        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
                unsigned int prio;

                if (!xc)
                        continue;

                kvmppc_xive_disable_vcpu_interrupts(vcpu);

                for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {

                        /* Single escalation, no queue 7 */
                        if (prio == 7 && xive->single_escalation)
                                break;

                        if (xc->esc_virq[prio]) {
                                free_irq(xc->esc_virq[prio], vcpu);
                                irq_dispose_mapping(xc->esc_virq[prio]);
                                kfree(xc->esc_virq_names[prio]);
                                xc->esc_virq[prio] = 0;
                        }

                        kvmppc_xive_native_cleanup_queue(vcpu, prio);
                }
        }

        for (i = 0; i <= xive->max_sbid; i++) {
                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

                if (sb) {
                        arch_spin_lock(&sb->lock);
                        kvmppc_xive_reset_sources(sb);
                        arch_spin_unlock(&sb->lock);
                }
        }

        mutex_unlock(&xive->lock);

        return 0;
}

static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
{
        int j;

        for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
                struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
                struct xive_irq_data *xd;
                u32 hw_num;

                if (!state->valid)
                        continue;

                /*
                 * The struct kvmppc_xive_irq_state reflects the state
                 * of the EAS configuration and not the state of the
                 * source. The source is masked by setting the PQ bits
                 * to '-Q', which is what is done before calling the
                 * KVM_DEV_XIVE_EQ_SYNC control.
                 *
                 * If a source EAS is configured, OPAL syncs the XIVE
                 * IC of the source and the XIVE IC of the previous
                 * target if any.
                 *
                 * So it should be fine ignoring MASKED sources as
                 * they have been synced already.
                 */
                if (state->act_priority == MASKED)
                        continue;

                kvmppc_xive_select_irq(state, &hw_num, &xd);
                xive_native_sync_source(hw_num);
                xive_native_sync_queue(hw_num);
        }
}

static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        unsigned int prio;
        int srcu_idx;

        if (!xc)
                return -ENOENT;

        for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
                struct xive_q *q = &xc->queues[prio];

                if (!q->qpage)
                        continue;

                /* Mark EQ page dirty for migration */
                srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
                mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
                srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
        }
        return 0;
}

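/*
 * KVM_DEV_XIVE_EQ_SYNC: sync all interrupt sources and queues, then
 * mark the guest EQ pages dirty so that a migration captures a
 * consistent view of the event queues.
 */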
static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
{
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        unsigned int i;

        pr_devel("%s\n", __func__);

        mutex_lock(&xive->lock);
        for (i = 0; i <= xive->max_sbid; i++) {
                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

                if (sb) {
                        arch_spin_lock(&sb->lock);
                        kvmppc_xive_native_sync_sources(sb);
                        arch_spin_unlock(&sb->lock);
                }
        }

        kvm_for_each_vcpu(i, vcpu, kvm) {
                kvmppc_xive_native_vcpu_eq_sync(vcpu);
        }
        mutex_unlock(&xive->lock);

        return 0;
}

static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
                                       struct kvm_device_attr *attr)
{
        struct kvmppc_xive *xive = dev->private;

        switch (attr->group) {
        case KVM_DEV_XIVE_GRP_CTRL:
                switch (attr->attr) {
                case KVM_DEV_XIVE_RESET:
                        return kvmppc_xive_reset(xive);
                case KVM_DEV_XIVE_EQ_SYNC:
                        return kvmppc_xive_native_eq_sync(xive);
                case KVM_DEV_XIVE_NR_SERVERS:
                        return kvmppc_xive_set_nr_servers(xive, attr->addr);
                }
                break;
        case KVM_DEV_XIVE_GRP_SOURCE:
                return kvmppc_xive_native_set_source(xive, attr->attr,
                                                     attr->addr);
        case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
                return kvmppc_xive_native_set_source_config(xive, attr->attr,
                                                            attr->addr);
        case KVM_DEV_XIVE_GRP_EQ_CONFIG:
                return kvmppc_xive_native_set_queue_config(xive, attr->attr,
                                                           attr->addr);
        case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
                return kvmppc_xive_native_sync_source(xive, attr->attr,
                                                      attr->addr);
        }
        return -ENXIO;
}

static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
                                       struct kvm_device_attr *attr)
{
        struct kvmppc_xive *xive = dev->private;

        switch (attr->group) {
        case KVM_DEV_XIVE_GRP_EQ_CONFIG:
                return kvmppc_xive_native_get_queue_config(xive, attr->attr,
                                                           attr->addr);
        }
        return -ENXIO;
}

static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
                                       struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_XIVE_GRP_CTRL:
                switch (attr->attr) {
                case KVM_DEV_XIVE_RESET:
                case KVM_DEV_XIVE_EQ_SYNC:
                case KVM_DEV_XIVE_NR_SERVERS:
                        return 0;
                }
                break;
        case KVM_DEV_XIVE_GRP_SOURCE:
        case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
        case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
                if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
                    attr->attr < KVMPPC_XIVE_NR_IRQS)
                        return 0;
                break;
        case KVM_DEV_XIVE_GRP_EQ_CONFIG:
                return 0;
        }
        return -ENXIO;
}

/*
 * Called when device fd is closed.  kvm->lock is held.
 */
static void kvmppc_xive_native_release(struct kvm_device *dev)
{
        struct kvmppc_xive *xive = dev->private;
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        int i;

        pr_devel("Releasing xive native device\n");

        /*
         * Clear the KVM device file address_space which is used to
         * unmap the ESB pages when a device is passed-through.
         */
        mutex_lock(&xive->mapping_lock);
        xive->mapping = NULL;
        mutex_unlock(&xive->mapping_lock);

        /*
         * Since this is the device release function, we know that
         * userspace does not have any open fd or mmap referring to
         * the device.  Therefore none of the device attribute
         * set/get, mmap, or page fault functions can be running
         * concurrently, and similarly, the connect_vcpu and
         * set/clr_mapped functions cannot be running either.
         */

        debugfs_remove(xive->dentry);

        /*
         * We should clean up the vCPU interrupt presenters first.
         */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /*
                 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
                 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
                 * Holding the vcpu->mutex also means that the vcpu cannot
                 * be executing the KVM_RUN ioctl, and therefore it cannot
                 * be executing the XIVE push or pull code or accessing
                 * the XIVE MMIO regions.
                 */
                mutex_lock(&vcpu->mutex);
                kvmppc_xive_native_cleanup_vcpu(vcpu);
                mutex_unlock(&vcpu->mutex);
        }

        /*
         * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
         * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
         * against xive code getting called during vcpu execution or
         * set/get one_reg operations.
         */
        kvm->arch.xive = NULL;

        for (i = 0; i <= xive->max_sbid; i++) {
                if (xive->src_blocks[i])
                        kvmppc_xive_free_sources(xive->src_blocks[i]);
                kfree(xive->src_blocks[i]);
                xive->src_blocks[i] = NULL;
        }

        if (xive->vp_base != XIVE_INVALID_VP)
                xive_native_free_vp_block(xive->vp_base);

        /*
         * A reference to the kvmppc_xive pointer is now kept under
         * the xive_devices struct of the machine for reuse. It is
         * freed when the VM is destroyed for now until we fix all the
         * execution paths.
         */

        kfree(dev);
}

/*
 * Create a XIVE device.  kvm->lock is held.
 */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
        struct kvmppc_xive *xive;
        struct kvm *kvm = dev->kvm;

        pr_devel("Creating xive native device\n");

        if (kvm->arch.xive)
                return -EEXIST;

        xive = kvmppc_xive_get_device(kvm, type);
        if (!xive)
                return -ENOMEM;

        dev->private = xive;
        xive->dev = dev;
        xive->kvm = kvm;
        mutex_init(&xive->mapping_lock);
        mutex_init(&xive->lock);

        /* VP allocation is delayed to the first call to connect_vcpu */
        xive->vp_base = XIVE_INVALID_VP;
        /*
         * KVM_MAX_VCPUS limits the number of VMs to roughly 64 per
         * socket on a POWER9 system.
         */
        xive->nr_servers = KVM_MAX_VCPUS;

        xive->single_escalation = xive_native_has_single_escalation();
        xive->ops = &kvmppc_xive_native_ops;

        kvm->arch.xive = xive;
        return 0;
}

/*
 * Interrupt Pending Buffer (IPB) offset
 */
#define TM_IPB_SHIFT 40
#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)

int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        u64 opal_state;
        int rc;

        if (!kvmppc_xive_enabled(vcpu))
                return -EPERM;

        if (!xc)
                return -ENOENT;

        /* Thread context registers. We only care about IPB and CPPR */
        val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;

        /* Get the VP state from OPAL */
        rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
        if (rc)
                return rc;

        /*
         * Capture the backup of the IPB register in the NVT structure
         * and merge it into our KVM VP state.
         */
        val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);

        pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
                 __func__,
                 vcpu->arch.xive_saved_state.nsr,
                 vcpu->arch.xive_saved_state.cppr,
                 vcpu->arch.xive_saved_state.ipb,
                 vcpu->arch.xive_saved_state.pipr,
                 vcpu->arch.xive_saved_state.w01,
                 (u32) vcpu->arch.xive_cam_word, opal_state);

        return 0;
}

int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
        struct kvmppc_xive *xive = vcpu->kvm->arch.xive;

        pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
                 val->xive_timaval[0], val->xive_timaval[1]);

        if (!kvmppc_xive_enabled(vcpu))
                return -EPERM;

        if (!xc || !xive)
                return -ENOENT;

        /* We can't update the state of a "pushed" VCPU */
        if (WARN_ON(vcpu->arch.xive_pushed))
                return -EBUSY;

        /*
         * Restore the thread context registers. IPB and CPPR should
         * be the only ones that matter.
         */
        vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];

        /*
         * There is no need to restore the XIVE internal state (IPB
         * stored in the NVT) as the IPB register was merged into the
         * KVM VP state when it was captured.
         */
        return 0;
}

bool kvmppc_xive_native_supported(void)
{
        return xive_native_has_queue_state_support();
}

static int xive_native_debug_show(struct seq_file *m, void *private)
{
        struct kvmppc_xive *xive = m->private;
        struct kvm *kvm = xive->kvm;
        struct kvm_vcpu *vcpu;
        unsigned int i;

        if (!kvm)
                return 0;

        seq_puts(m, "=========\nVCPU state\n=========\n");

        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

                if (!xc)
                        continue;

                seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
                           xc->server_num, xc->vp_id,
                           vcpu->arch.xive_saved_state.nsr,
                           vcpu->arch.xive_saved_state.cppr,
                           vcpu->arch.xive_saved_state.ipb,
                           vcpu->arch.xive_saved_state.pipr,
                           vcpu->arch.xive_saved_state.w01,
                           (u32) vcpu->arch.xive_cam_word);

                kvmppc_xive_debug_show_queues(m, vcpu);
        }

        return 0;
}

static int xive_native_debug_open(struct inode *inode, struct file *file)
{
        return single_open(file, xive_native_debug_show, inode->i_private);
}

static const struct file_operations xive_native_debug_fops = {
        .open = xive_native_debug_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
        char *name;

        name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
        if (!name) {
                pr_err("%s: no memory for name\n", __func__);
                return;
        }

        xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
                                           xive, &xive_native_debug_fops);

        pr_debug("%s: created %s\n", __func__, name);
        kfree(name);
}

static void kvmppc_xive_native_init(struct kvm_device *dev)
{
        struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

        /* Register some debug interfaces */
        xive_native_debugfs_init(xive);
}

struct kvm_device_ops kvm_xive_native_ops = {
        .name = "kvm-xive-native",
        .create = kvmppc_xive_native_create,
        .init = kvmppc_xive_native_init,
        .release = kvmppc_xive_native_release,
        .set_attr = kvmppc_xive_native_set_attr,
        .get_attr = kvmppc_xive_native_get_attr,
        .has_attr = kvmppc_xive_native_has_attr,
        .mmap = kvmppc_xive_native_mmap,
};

void kvmppc_xive_native_init_module(void)
{
        ;
}

void kvmppc_xive_native_exit_module(void)
{
        ;
}