KVM: SVM: Detect X2APIC virtualization (x2AVIC) support
[linux-2.6-microblaze.git] / arch/x86/kvm/svm/avic.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * AMD SVM support
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *   Avi Kivity   <avi@qumranet.com>
13  */
14
15 #define pr_fmt(fmt) "SVM: " fmt
16
17 #include <linux/kvm_types.h>
18 #include <linux/hashtable.h>
19 #include <linux/amd-iommu.h>
20 #include <linux/kvm_host.h>
21
22 #include <asm/irq_remapping.h>
23
24 #include "trace.h"
25 #include "lapic.h"
26 #include "x86.h"
27 #include "irq.h"
28 #include "svm.h"
29
30 /* AVIC GATAG is encoded using VM and VCPU IDs */
31 #define AVIC_VCPU_ID_BITS               8
32 #define AVIC_VCPU_ID_MASK               ((1 << AVIC_VCPU_ID_BITS) - 1)
33
34 #define AVIC_VM_ID_BITS                 24
35 #define AVIC_VM_ID_NR                   (1 << AVIC_VM_ID_BITS)
36 #define AVIC_VM_ID_MASK                 ((1 << AVIC_VM_ID_BITS) - 1)
37
38 #define AVIC_GATAG(x, y)                (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
39                                                 (y & AVIC_VCPU_ID_MASK))
40 #define AVIC_GATAG_TO_VMID(x)           ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
41 #define AVIC_GATAG_TO_VCPUID(x)         (x & AVIC_VCPU_ID_MASK)
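/*
 * Illustrative decode of the GA tag layout above (example values only,
 * not taken from the original source):
 *
 *	u32 tag = AVIC_GATAG(0xabcd, 3);	// == 0x00abcd03
 *	AVIC_GATAG_TO_VMID(tag);		// == 0xabcd
 *	AVIC_GATAG_TO_VCPUID(tag);		// == 0x03
 *
 * i.e. bits [31:8] carry the VM ID and bits [7:0] carry the vCPU ID.
 */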
42
43 static bool force_avic;
44 module_param_unsafe(force_avic, bool, 0444);
45
46 /* Note:
47  * This hash table is used to map a VM_ID to a struct kvm_svm when
48  * handling an AMD IOMMU GALOG notification to schedule in a
49  * particular vCPU.
50  */
51 #define SVM_VM_DATA_HASH_BITS   8
52 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
53 static u32 next_vm_id = 0;
54 static bool next_vm_id_wrapped = 0;
55 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
56 enum avic_modes avic_mode;
57
58 /*
59  * This is a wrapper of struct amd_iommu_ir_data.
60  */
61 struct amd_svm_iommu_ir {
62         struct list_head node;  /* Used by SVM for per-vcpu ir_list */
63         void *data;             /* Storing pointer to struct amd_ir_data */
64 };
65
66
67 /* Note:
68  * This function is called from the IOMMU driver to notify
69  * SVM to schedule in a particular vCPU of a particular VM.
70  */
71 int avic_ga_log_notifier(u32 ga_tag)
72 {
73         unsigned long flags;
74         struct kvm_svm *kvm_svm;
75         struct kvm_vcpu *vcpu = NULL;
76         u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
77         u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
78
79         pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
80         trace_kvm_avic_ga_log(vm_id, vcpu_id);
81
82         spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
83         hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
84                 if (kvm_svm->avic_vm_id != vm_id)
85                         continue;
86                 vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
87                 break;
88         }
89         spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
90
91         /* Note:
92          * At this point, the IOMMU should have already set the pending
93          * bit in the vAPIC backing page. So, we just need to schedule
94          * in the vcpu.
95          */
96         if (vcpu)
97                 kvm_vcpu_wake_up(vcpu);
98
99         return 0;
100 }
101
102 void avic_vm_destroy(struct kvm *kvm)
103 {
104         unsigned long flags;
105         struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
106
107         if (!enable_apicv)
108                 return;
109
110         if (kvm_svm->avic_logical_id_table_page)
111                 __free_page(kvm_svm->avic_logical_id_table_page);
112         if (kvm_svm->avic_physical_id_table_page)
113                 __free_page(kvm_svm->avic_physical_id_table_page);
114
115         spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
116         hash_del(&kvm_svm->hnode);
117         spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
118 }
119
120 int avic_vm_init(struct kvm *kvm)
121 {
122         unsigned long flags;
123         int err = -ENOMEM;
124         struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
125         struct kvm_svm *k2;
126         struct page *p_page;
127         struct page *l_page;
128         u32 vm_id;
129
130         if (!enable_apicv)
131                 return 0;
132
133         /* Allocating physical APIC ID table (4KB) */
134         p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
135         if (!p_page)
136                 goto free_avic;
137
138         kvm_svm->avic_physical_id_table_page = p_page;
139
140         /* Allocating logical APIC ID table (4KB) */
141         l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
142         if (!l_page)
143                 goto free_avic;
144
145         kvm_svm->avic_logical_id_table_page = l_page;
146
147         spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
148  again:
149         vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
150         if (vm_id == 0) { /* id is 1-based, zero is not okay */
151                 next_vm_id_wrapped = 1;
152                 goto again;
153         }
154         /* Is it still in use? Only possible if wrapped at least once */
155         if (next_vm_id_wrapped) {
156                 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
157                         if (k2->avic_vm_id == vm_id)
158                                 goto again;
159                 }
160         }
161         kvm_svm->avic_vm_id = vm_id;
162         hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
163         spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
164
165         return 0;
166
167 free_avic:
168         avic_vm_destroy(kvm);
169         return err;
170 }
171
172 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
173 {
174         struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
175         phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
176         phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
177         phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
178
179         vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
180         vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
181         vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
182         vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
183         vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
184
185         if (kvm_apicv_activated(svm->vcpu.kvm))
186                 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
187         else
188                 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
189 }
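/*
 * Summary of what avic_init_vmcb() programs, inferred from the field and
 * mask names above (treat the exact bit layout as an assumption; it is
 * defined by the APM and the AVIC headers, not restated here):
 * - avic_backing_page: host physical address of this vCPU's vAPIC page
 * - avic_logical_id / avic_physical_id: host physical addresses of the
 *   per-VM logical and physical APIC ID tables, with the low bits of
 *   avic_physical_id carrying the maximum physical APIC ID index
 * - avic_vapic_bar: the guest's APIC MMIO base (0xfee00000)
 * AVIC is only actually armed when AVIC_ENABLE_MASK is set in int_ctl.
 */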
190
191 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
192                                        unsigned int index)
193 {
194         u64 *avic_physical_id_table;
195         struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
196
197         if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
198                 return NULL;
199
200         avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
201
202         return &avic_physical_id_table[index];
203 }
204
205 /*
206  * Note:
207  * AVIC hardware walks the nested page table to check permissions,
208  * but does not use the SPA address specified in the leaf page
209  * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
210  * field of the VMCB. Therefore, we set up the
211  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
212  */
213 static int avic_alloc_access_page(struct kvm *kvm)
214 {
215         void __user *ret;
216         int r = 0;
217
218         mutex_lock(&kvm->slots_lock);
219
220         if (kvm->arch.apic_access_memslot_enabled)
221                 goto out;
222
223         ret = __x86_set_memory_region(kvm,
224                                       APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
225                                       APIC_DEFAULT_PHYS_BASE,
226                                       PAGE_SIZE);
227         if (IS_ERR(ret)) {
228                 r = PTR_ERR(ret);
229                 goto out;
230         }
231
232         kvm->arch.apic_access_memslot_enabled = true;
233 out:
234         mutex_unlock(&kvm->slots_lock);
235         return r;
236 }
237
238 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
239 {
240         u64 *entry, new_entry;
241         int id = vcpu->vcpu_id;
242         struct vcpu_svm *svm = to_svm(vcpu);
243
244         if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
245                 return -EINVAL;
246
247         if (!vcpu->arch.apic->regs)
248                 return -EINVAL;
249
250         if (kvm_apicv_activated(vcpu->kvm)) {
251                 int ret;
252
253                 ret = avic_alloc_access_page(vcpu->kvm);
254                 if (ret)
255                         return ret;
256         }
257
258         svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
259
260         /* Set the AVIC backing page address in the physical APIC ID table */
261         entry = avic_get_physical_id_entry(vcpu, id);
262         if (!entry)
263                 return -EINVAL;
264
265         new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
266                               AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
267                               AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
268         WRITE_ONCE(*entry, new_entry);
269
270         svm->avic_physical_id_cache = entry;
271
272         return 0;
273 }
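/*
 * Rough shape of the physical APIC ID table entry written above, based
 * only on the mask names (exact bit positions live in the AVIC headers):
 * backing-page host PA, host physical APIC ID, IsRunning and Valid.
 * avic_init_backing_page() fills in the backing page address and the
 * Valid bit; the host APIC ID and IsRunning bits are managed later by
 * avic_vcpu_load() and avic_vcpu_put().
 */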
274
275 void avic_ring_doorbell(struct kvm_vcpu *vcpu)
276 {
277         /*
278          * Note, the vCPU could get migrated to a different pCPU at any point,
279          * which could result in signalling the wrong/previous pCPU.  But if
280          * that happens the vCPU is guaranteed to do a VMRUN (after being
281          * migrated) and thus will process pending interrupts, i.e. a doorbell
282          * is not needed (and the spurious one is harmless).
283          */
284         int cpu = READ_ONCE(vcpu->cpu);
285
286         if (cpu != get_cpu())
287                 wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
288         put_cpu();
289 }
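/*
 * Typical (assumed) usage: the sender first sets the vector in the
 * target's vAPIC backing page and then, if the target is running in
 * guest mode, rings the doorbell so hardware injects the interrupt
 * without a VM exit; otherwise the target vCPU is woken up instead
 * (see svm_complete_interrupt_delivery() in svm.c).
 */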
290
291 /*
292  * A fast-path version of avic_kick_target_vcpus(), which attempts to match
293  * destination APIC ID to vCPU without looping through all vCPUs.
294  */
295 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
296                                        u32 icrl, u32 icrh, u32 index)
297 {
298         u32 l1_physical_id, dest;
299         struct kvm_vcpu *target_vcpu;
300         int dest_mode = icrl & APIC_DEST_MASK;
301         int shorthand = icrl & APIC_SHORT_MASK;
302         struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
303
304         if (shorthand != APIC_DEST_NOSHORT)
305                 return -EINVAL;
306
307         if (apic_x2apic_mode(source))
308                 dest = icrh;
309         else
310                 dest = GET_XAPIC_DEST_FIELD(icrh);
311
312         if (dest_mode == APIC_DEST_PHYSICAL) {
313                 /* broadcast destination, use slow path */
314                 if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
315                         return -EINVAL;
316                 if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
317                         return -EINVAL;
318
319                 l1_physical_id = dest;
320
321                 if (WARN_ON_ONCE(l1_physical_id != index))
322                         return -EINVAL;
323
324         } else {
325                 u32 bitmap, cluster;
326                 int logid_index;
327
328                 if (apic_x2apic_mode(source)) {
329                         /* 16 bit dest mask, 16 bit cluster id */
330                         bitmap = dest & 0xFFFF;
331                         cluster = (dest >> 16) << 4;
332                 } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
333                         /* 8 bit dest mask */
334                         bitmap = dest;
335                         cluster = 0;
336                 } else {
337                         /* 4 bit dest mask, 4 bit cluster id */
338                         bitmap = dest & 0xF;
339                         cluster = (dest >> 4) << 2;
340                 }
341
342                 if (unlikely(!bitmap))
343                         /* guest bug: nobody to send the logical interrupt to */
344                         return 0;
345
346                 if (!is_power_of_2(bitmap))
347                         /* multiple logical destinations, use slow path */
348                         return -EINVAL;
349
350                 logid_index = cluster + __ffs(bitmap);
351
352                 if (apic_x2apic_mode(source)) {
353                         l1_physical_id = logid_index;
354                 } else {
355                         u32 *avic_logical_id_table =
356                                 page_address(kvm_svm->avic_logical_id_table_page);
357
358                         u32 logid_entry = avic_logical_id_table[logid_index];
359
360                         if (WARN_ON_ONCE(index != logid_index))
361                                 return -EINVAL;
362
363                         /* guest bug: non-existent/reserved logical destination */
364                         if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
365                                 return 0;
366
367                         l1_physical_id = logid_entry &
368                                          AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
369                 }
370         }
371
372         target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
373         if (unlikely(!target_vcpu))
374                 /* guest bug: non-existent vCPU is a target of this IPI */
375                 return 0;
376
377         target_vcpu->arch.apic->irr_pending = true;
378         svm_complete_interrupt_delivery(target_vcpu,
379                                         icrl & APIC_MODE_MASK,
380                                         icrl & APIC_INT_LEVELTRIG,
381                                         icrl & APIC_VECTOR_MASK);
382         return 0;
383 }
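/*
 * Illustrative fast-path decode for an x2APIC logical-mode IPI,
 * following the arithmetic above (example ICRH value only):
 *
 *	icrh = 0x00030001  ->  bitmap = 0x0001, cluster = 0x3 << 4 = 0x30
 *	                       logid_index = 0x30 + __ffs(0x0001) = 0x30
 *	                       l1_physical_id = 0x30 (vCPU with APIC ID 48)
 *
 * Any destination with more than one bit set in the bitmap falls back
 * to the slow path in avic_kick_target_vcpus().
 */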
384
385 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
386                                    u32 icrl, u32 icrh, u32 index)
387 {
388         unsigned long i;
389         struct kvm_vcpu *vcpu;
390
391         if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
392                 return;
393
394         trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
395
396         /*
397          * Wake any target vCPUs that are blocking, i.e. waiting for a wake
398          * event.  There's no need to signal doorbells, as hardware has handled
399          * vCPUs that were in guest at the time of the IPI, and vCPUs that have
400          * since entered the guest will have processed pending IRQs at VMRUN.
401          */
402         kvm_for_each_vcpu(i, vcpu, kvm) {
403                 if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
404                                         GET_XAPIC_DEST_FIELD(icrh),
405                                         icrl & APIC_DEST_MASK)) {
406                         vcpu->arch.apic->irr_pending = true;
407                         svm_complete_interrupt_delivery(vcpu,
408                                                         icrl & APIC_MODE_MASK,
409                                                         icrl & APIC_INT_LEVELTRIG,
410                                                         icrl & APIC_VECTOR_MASK);
411                 }
412         }
413 }
414
415 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
416 {
417         struct vcpu_svm *svm = to_svm(vcpu);
418         u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
419         u32 icrl = svm->vmcb->control.exit_info_1;
420         u32 id = svm->vmcb->control.exit_info_2 >> 32;
421         u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
422         struct kvm_lapic *apic = vcpu->arch.apic;
423
424         trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
425
426         switch (id) {
427         case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
428                 /*
429                  * Emulate IPIs that are not handled by AVIC hardware, which
430                  * only virtualizes Fixed, Edge-Triggered INTRs.  The exit is
431                  * a trap, e.g. ICR holds the correct value and RIP has been
432                  * advanced, KVM is responsible only for emulating the IPI.
433                  * Sadly, hardware may sometimes leave the BUSY flag set, in
434                  * which case KVM needs to emulate the ICR write as well in
435                  * order to clear the BUSY flag.
436                  */
437                 if (icrl & APIC_ICR_BUSY)
438                         kvm_apic_write_nodecode(vcpu, APIC_ICR);
439                 else
440                         kvm_apic_send_ipi(apic, icrl, icrh);
441                 break;
442         case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
443                 /*
444                  * At this point, we expect that the AVIC HW has already
445                  * set the appropriate IRR bits on the valid target
446                  * vcpus. So, we just need to kick the appropriate vcpu.
447                  */
448                 avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
449                 break;
450         case AVIC_IPI_FAILURE_INVALID_TARGET:
451                 break;
452         case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
453                 WARN_ONCE(1, "Invalid backing page\n");
454                 break;
455         default:
456                 pr_err("Unknown IPI interception\n");
457         }
458
459         return 1;
460 }
461
462 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
463 {
464         if (is_guest_mode(vcpu))
465                 return APICV_INHIBIT_REASON_NESTED;
466         return 0;
467 }
468
469 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
470 {
471         struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
472         int index;
473         u32 *logical_apic_id_table;
474         int dlid = GET_APIC_LOGICAL_ID(ldr);
475
476         if (!dlid)
477                 return NULL;
478
479         if (flat) { /* flat */
480                 index = ffs(dlid) - 1;
481                 if (index > 7)
482                         return NULL;
483         } else { /* cluster */
484                 int cluster = (dlid & 0xf0) >> 4;
485                 int apic = ffs(dlid & 0x0f) - 1;
486
487                 if ((apic < 0) || (apic > 7) ||
488                     (cluster >= 0xf))
489                         return NULL;
490                 index = (cluster << 2) + apic;
491         }
492
493         logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
494
495         return &logical_apic_id_table[index];
496 }
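/*
 * Index examples for the lookup above (values made up for illustration):
 * - flat mode:    logical ID 0x08 -> index = ffs(0x08) - 1 = 3
 * - cluster mode: logical ID 0x42 -> cluster 4, apic = ffs(0x2) - 1 = 1,
 *                                    index = (4 << 2) + 1 = 17
 * Flat mode uses a single 8-bit mask; cluster mode splits the ID into a
 * 4-bit cluster number and a 4-bit APIC mask.
 */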
497
498 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
499 {
500         bool flat;
501         u32 *entry, new_entry;
502
503         flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
504         entry = avic_get_logical_id_entry(vcpu, ldr, flat);
505         if (!entry)
506                 return -EINVAL;
507
508         new_entry = READ_ONCE(*entry);
509         new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
510         new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
511         new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
512         WRITE_ONCE(*entry, new_entry);
513
514         return 0;
515 }
516
517 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
518 {
519         struct vcpu_svm *svm = to_svm(vcpu);
520         bool flat = svm->dfr_reg == APIC_DFR_FLAT;
521         u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
522
523         if (entry)
524                 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
525 }
526
527 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
528 {
529         int ret = 0;
530         struct vcpu_svm *svm = to_svm(vcpu);
531         u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
532         u32 id = kvm_xapic_id(vcpu->arch.apic);
533
534         if (ldr == svm->ldr_reg)
535                 return 0;
536
537         avic_invalidate_logical_id_entry(vcpu);
538
539         if (ldr)
540                 ret = avic_ldr_write(vcpu, id, ldr);
541
542         if (!ret)
543                 svm->ldr_reg = ldr;
544
545         return ret;
546 }
547
548 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
549 {
550         struct vcpu_svm *svm = to_svm(vcpu);
551         u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
552
553         if (svm->dfr_reg == dfr)
554                 return;
555
556         avic_invalidate_logical_id_entry(vcpu);
557         svm->dfr_reg = dfr;
558 }
559
560 static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
561 {
562         u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
563                                 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
564
565         switch (offset) {
566         case APIC_LDR:
567                 if (avic_handle_ldr_update(vcpu))
568                         return 0;
569                 break;
570         case APIC_DFR:
571                 avic_handle_dfr_update(vcpu);
572                 break;
573         default:
574                 break;
575         }
576
577         kvm_apic_write_nodecode(vcpu, offset);
578         return 1;
579 }
580
581 static bool is_avic_unaccelerated_access_trap(u32 offset)
582 {
583         bool ret = false;
584
585         switch (offset) {
586         case APIC_ID:
587         case APIC_EOI:
588         case APIC_RRR:
589         case APIC_LDR:
590         case APIC_DFR:
591         case APIC_SPIV:
592         case APIC_ESR:
593         case APIC_ICR:
594         case APIC_LVTT:
595         case APIC_LVTTHMR:
596         case APIC_LVTPC:
597         case APIC_LVT0:
598         case APIC_LVT1:
599         case APIC_LVTERR:
600         case APIC_TMICT:
601         case APIC_TDCR:
602                 ret = true;
603                 break;
604         default:
605                 break;
606         }
607         return ret;
608 }
609
610 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
611 {
612         struct vcpu_svm *svm = to_svm(vcpu);
613         int ret = 0;
614         u32 offset = svm->vmcb->control.exit_info_1 &
615                      AVIC_UNACCEL_ACCESS_OFFSET_MASK;
616         u32 vector = svm->vmcb->control.exit_info_2 &
617                      AVIC_UNACCEL_ACCESS_VECTOR_MASK;
618         bool write = (svm->vmcb->control.exit_info_1 >> 32) &
619                      AVIC_UNACCEL_ACCESS_WRITE_MASK;
620         bool trap = is_avic_unaccelerated_access_trap(offset);
621
622         trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
623                                             trap, write, vector);
624         if (trap) {
625                 /* Handling Trap */
626                 WARN_ONCE(!write, "svm: Handling trap read.\n");
627                 ret = avic_unaccel_trap_write(vcpu);
628         } else {
629                 /* Handling Fault */
630                 ret = kvm_emulate_instruction(vcpu, 0);
631         }
632
633         return ret;
634 }
635
636 int avic_init_vcpu(struct vcpu_svm *svm)
637 {
638         int ret;
639         struct kvm_vcpu *vcpu = &svm->vcpu;
640
641         if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
642                 return 0;
643
644         ret = avic_init_backing_page(vcpu);
645         if (ret)
646                 return ret;
647
648         INIT_LIST_HEAD(&svm->ir_list);
649         spin_lock_init(&svm->ir_list_lock);
650         svm->dfr_reg = APIC_DFR_FLAT;
651
652         return ret;
653 }
654
655 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
656 {
657         avic_handle_dfr_update(vcpu);
658         avic_handle_ldr_update(vcpu);
659 }
660
661 static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
662 {
663         int ret = 0;
664         unsigned long flags;
665         struct amd_svm_iommu_ir *ir;
666         struct vcpu_svm *svm = to_svm(vcpu);
667
668         if (!kvm_arch_has_assigned_device(vcpu->kvm))
669                 return 0;
670
671         /*
672          * Here, we go through the per-vcpu ir_list to update all existing
673          * interrupt remapping table entries targeting this vcpu.
674          */
675         spin_lock_irqsave(&svm->ir_list_lock, flags);
676
677         if (list_empty(&svm->ir_list))
678                 goto out;
679
680         list_for_each_entry(ir, &svm->ir_list, node) {
681                 if (activate)
682                         ret = amd_iommu_activate_guest_mode(ir->data);
683                 else
684                         ret = amd_iommu_deactivate_guest_mode(ir->data);
685                 if (ret)
686                         break;
687         }
688 out:
689         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
690         return ret;
691 }
692
693 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
694 {
695         unsigned long flags;
696         struct amd_svm_iommu_ir *cur;
697
698         spin_lock_irqsave(&svm->ir_list_lock, flags);
699         list_for_each_entry(cur, &svm->ir_list, node) {
700                 if (cur->data != pi->ir_data)
701                         continue;
702                 list_del(&cur->node);
703                 kfree(cur);
704                 break;
705         }
706         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
707 }
708
709 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
710 {
711         int ret = 0;
712         unsigned long flags;
713         struct amd_svm_iommu_ir *ir;
714
715         /**
716          * In some cases, the existing irte is updated and re-set,
717          * so we need to check here if it's already been added
718          * to the ir_list.
719          */
720         if (pi->ir_data && (pi->prev_ga_tag != 0)) {
721                 struct kvm *kvm = svm->vcpu.kvm;
722                 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
723                 struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
724                 struct vcpu_svm *prev_svm;
725
726                 if (!prev_vcpu) {
727                         ret = -EINVAL;
728                         goto out;
729                 }
730
731                 prev_svm = to_svm(prev_vcpu);
732                 svm_ir_list_del(prev_svm, pi);
733         }
734
735         /**
736          * Allocate a new amd_svm_iommu_ir, which will be added
737          * to the per-vcpu ir_list.
738          */
739         ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
740         if (!ir) {
741                 ret = -ENOMEM;
742                 goto out;
743         }
744         ir->data = pi->ir_data;
745
746         spin_lock_irqsave(&svm->ir_list_lock, flags);
747         list_add(&ir->node, &svm->ir_list);
748         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
749 out:
750         return ret;
751 }
752
753 /*
754  * Note:
755  * The HW cannot support posting multicast/broadcast
756  * interrupts to a vCPU. So, we still use legacy interrupt
757  * remapping for these kinds of interrupts.
758  *
759  * For lowest-priority interrupts, we only support
760  * those with a single CPU as the destination, e.g. the user
761  * configures the interrupts via /proc/irq or uses
762  * irqbalance to make the interrupts single-CPU.
763  */
764 static int
765 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
766                  struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
767 {
768         struct kvm_lapic_irq irq;
769         struct kvm_vcpu *vcpu = NULL;
770
771         kvm_set_msi_irq(kvm, e, &irq);
772
773         if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
774             !kvm_irq_is_postable(&irq)) {
775                 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
776                          __func__, irq.vector);
777                 return -1;
778         }
779
780         pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
781                  irq.vector);
782         *svm = to_svm(vcpu);
783         vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
784         vcpu_info->vector = irq.vector;
785
786         return 0;
787 }
788
789 /*
790  * avic_pi_update_irte - set IRTE for Posted-Interrupts
791  *
792  * @kvm: kvm
793  * @host_irq: host irq of the interrupt
794  * @guest_irq: gsi of the interrupt
795  * @set: set or unset PI
796  * returns 0 on success, < 0 on failure
797  */
798 int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
799                         uint32_t guest_irq, bool set)
800 {
801         struct kvm_kernel_irq_routing_entry *e;
802         struct kvm_irq_routing_table *irq_rt;
803         int idx, ret = 0;
804
805         if (!kvm_arch_has_assigned_device(kvm) ||
806             !irq_remapping_cap(IRQ_POSTING_CAP))
807                 return 0;
808
809         pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
810                  __func__, host_irq, guest_irq, set);
811
812         idx = srcu_read_lock(&kvm->irq_srcu);
813         irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
814
815         if (guest_irq >= irq_rt->nr_rt_entries ||
816                 hlist_empty(&irq_rt->map[guest_irq])) {
817                 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
818                              guest_irq, irq_rt->nr_rt_entries);
819                 goto out;
820         }
821
822         hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
823                 struct vcpu_data vcpu_info;
824                 struct vcpu_svm *svm = NULL;
825
826                 if (e->type != KVM_IRQ_ROUTING_MSI)
827                         continue;
828
829                 /**
830                  * Here, we set up legacy mode in the following cases:
831                  * 1. When the interrupt cannot be targeted at a specific vcpu.
832                  * 2. When unsetting the posted interrupt.
833                  * 3. When APIC virtualization is disabled for the vcpu.
834                  * 4. When the IRQ has an incompatible delivery mode (SMI, INIT, etc.)
835                  */
836                 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
837                     kvm_vcpu_apicv_active(&svm->vcpu)) {
838                         struct amd_iommu_pi_data pi;
839
840                         /* Try to enable guest_mode in IRTE */
841                         pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
842                                             AVIC_HPA_MASK);
843                         pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
844                                                      svm->vcpu.vcpu_id);
845                         pi.is_guest_mode = true;
846                         pi.vcpu_data = &vcpu_info;
847                         ret = irq_set_vcpu_affinity(host_irq, &pi);
848
849                         /**
850                          * Here, we have successfully set up vcpu affinity in
851                          * IOMMU guest mode. Now, we need to store the posted
852                          * interrupt information in a per-vcpu ir_list so that
853                          * we can reference it directly when we update the vcpu
854                          * scheduling information in the IOMMU IRTE.
855                          */
856                         if (!ret && pi.is_guest_mode)
857                                 svm_ir_list_add(svm, &pi);
858                 } else {
859                         /* Use legacy mode in IRTE */
860                         struct amd_iommu_pi_data pi;
861
862                         /**
863                          * Here, pi is used to:
864                          * - Tell IOMMU to use legacy mode for this interrupt.
865                          * - Retrieve ga_tag of prior interrupt remapping data.
866                          */
867                         pi.prev_ga_tag = 0;
868                         pi.is_guest_mode = false;
869                         ret = irq_set_vcpu_affinity(host_irq, &pi);
870
871                         /**
872                          * set up in guest_mode by checking if the ga_tag
873                          * setup with the guest_mode by checking if the ga_tag
874                          * was cached. If so, we need to clean up the per-vcpu
875                          * ir_list.
876                          */
877                         if (!ret && pi.prev_ga_tag) {
878                                 int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
879                                 struct kvm_vcpu *vcpu;
880
881                                 vcpu = kvm_get_vcpu_by_id(kvm, id);
882                                 if (vcpu)
883                                         svm_ir_list_del(to_svm(vcpu), &pi);
884                         }
885                 }
886
887                 if (!ret && svm) {
888                         trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
889                                                  e->gsi, vcpu_info.vector,
890                                                  vcpu_info.pi_desc_addr, set);
891                 }
892
893                 if (ret < 0) {
894                         pr_err("%s: failed to update PI IRTE\n", __func__);
895                         goto out;
896                 }
897         }
898
899         ret = 0;
900 out:
901         srcu_read_unlock(&kvm->irq_srcu, idx);
902         return ret;
903 }
904
905 bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
906 {
907         ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
908                           BIT(APICV_INHIBIT_REASON_ABSENT) |
909                           BIT(APICV_INHIBIT_REASON_HYPERV) |
910                           BIT(APICV_INHIBIT_REASON_NESTED) |
911                           BIT(APICV_INHIBIT_REASON_IRQWIN) |
912                           BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
913                           BIT(APICV_INHIBIT_REASON_X2APIC) |
914                           BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
915                           BIT(APICV_INHIBIT_REASON_SEV)      |
916                           BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
917                           BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
918
919         return supported & BIT(reason);
920 }
921
922
923 static inline int
924 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
925 {
926         int ret = 0;
927         unsigned long flags;
928         struct amd_svm_iommu_ir *ir;
929         struct vcpu_svm *svm = to_svm(vcpu);
930
931         if (!kvm_arch_has_assigned_device(vcpu->kvm))
932                 return 0;
933
934         /*
935          * Here, we go through the per-vcpu ir_list to update all existing
936          * interrupt remapping table entries targeting this vcpu.
937          */
938         spin_lock_irqsave(&svm->ir_list_lock, flags);
939
940         if (list_empty(&svm->ir_list))
941                 goto out;
942
943         list_for_each_entry(ir, &svm->ir_list, node) {
944                 ret = amd_iommu_update_ga(cpu, r, ir->data);
945                 if (ret)
946                         break;
947         }
948 out:
949         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
950         return ret;
951 }
952
953 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
954 {
955         u64 entry;
956         int h_physical_id = kvm_cpu_get_apicid(cpu);
957         struct vcpu_svm *svm = to_svm(vcpu);
958
959         lockdep_assert_preemption_disabled();
960
961         if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
962                 return;
963
964         /*
965          * No need to update anything if the vCPU is blocking, i.e. if the vCPU
966          * is being scheduled in after being preempted.  The CPU entries in the
967          * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
968          * If the vCPU was migrated, its new CPU value will be stuffed when the
969          * vCPU unblocks.
970          */
971         if (kvm_vcpu_is_blocking(vcpu))
972                 return;
973
974         entry = READ_ONCE(*(svm->avic_physical_id_cache));
975         WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
976
977         entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
978         entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
979         entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
980
981         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
982         avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
983 }
984
985 void avic_vcpu_put(struct kvm_vcpu *vcpu)
986 {
987         u64 entry;
988         struct vcpu_svm *svm = to_svm(vcpu);
989
990         lockdep_assert_preemption_disabled();
991
992         entry = READ_ONCE(*(svm->avic_physical_id_cache));
993
994         /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
995         if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
996                 return;
997
998         avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
999
1000         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1001         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1002 }
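/*
 * avic_vcpu_load() and avic_vcpu_put() form a pair: load publishes the
 * current host APIC ID and sets IsRunning so hardware (and, via
 * avic_update_iommu_vcpu_affinity(), the IOMMU) can deliver interrupts
 * directly, while put clears IsRunning so subsequent interrupts are
 * reported through incomplete-IPI exits or the GA log and the vCPU is
 * woken up instead.
 */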
1003
1004
1005 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
1006 {
1007         struct vcpu_svm *svm = to_svm(vcpu);
1008         struct vmcb *vmcb = svm->vmcb01.ptr;
1009         bool activated = kvm_vcpu_apicv_active(vcpu);
1010
1011         if (!enable_apicv)
1012                 return;
1013
1014         if (activated) {
1015                 /**
1016                  * During AVIC temporary deactivation, the guest could update
1017                  * APIC ID, DFR and LDR registers, which would not be trapped
1018                  * by avic_unaccelerated_access_interception(). In this case,
1019                  * we need to check and update the AVIC logical APIC ID table
1020                  * accordingly before re-activating.
1021                  */
1022                 avic_apicv_post_state_restore(vcpu);
1023                 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1024         } else {
1025                 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
1026         }
1027         vmcb_mark_dirty(vmcb, VMCB_AVIC);
1028
1029         if (activated)
1030                 avic_vcpu_load(vcpu, vcpu->cpu);
1031         else
1032                 avic_vcpu_put(vcpu);
1033
1034         avic_set_pi_irte_mode(vcpu, activated);
1035 }
1036
1037 void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
1038 {
1039         if (!kvm_vcpu_apicv_active(vcpu))
1040                 return;
1041
1042        /*
1043         * Unload the AVIC when the vCPU is about to block, _before_
1044         * the vCPU actually blocks.
1045         *
1046         * Any IRQs that arrive before IsRunning=0 will not cause an
1047         * incomplete IPI vmexit on the source, therefore vIRR will also
1048         * be checked by kvm_vcpu_check_block() before blocking.  The
1049         * memory barrier implicit in set_current_state orders writing
1050         * IsRunning=0 before reading the vIRR.  The processor needs a
1051         * matching memory barrier on interrupt delivery between writing
1052         * IRR and reading IsRunning; the lack of this barrier might be
1053         * the cause of erratum #1235.
1054         */
1055         avic_vcpu_put(vcpu);
1056 }
1057
1058 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
1059 {
1060         if (!kvm_vcpu_apicv_active(vcpu))
1061                 return;
1062
1063         avic_vcpu_load(vcpu, vcpu->cpu);
1064 }
1065
1066 /*
1067  * Note:
1068  * - The module param avic enables both xAPIC and x2APIC modes.
1069  * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
1070  * - The mode can be switched at run-time.
1071  */
1072 bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
1073 {
1074         if (!npt_enabled)
1075                 return false;
1076
1077         if (boot_cpu_has(X86_FEATURE_AVIC)) {
1078                 avic_mode = AVIC_MODE_X1;
1079                 pr_info("AVIC enabled\n");
1080         } else if (force_avic) {
1081                 /*
1082                  * Some older systems do not advertise AVIC support.
1083                  * See the Revision Guide for the specific AMD processor for more detail.
1084                  */
1085                 avic_mode = AVIC_MODE_X1;
1086                 pr_warn("AVIC is not supported in CPUID but force enabled");
1087                 pr_warn("Your system might crash and burn");
1088         }
1089
1090         /* AVIC is a prerequisite for x2AVIC. */
1091         if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
1092                 if (avic_mode == AVIC_MODE_X1) {
1093                         avic_mode = AVIC_MODE_X2;
1094                         pr_info("x2AVIC enabled\n");
1095                 } else {
1096                         pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled");
1097                         pr_warn(FW_BUG "Try enabling AVIC using the force_avic option");
1098                 }
1099         }
1100
1101         if (avic_mode != AVIC_MODE_NONE)
1102                 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1103
1104         return !!avic_mode;
1105 }
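/*
 * Resulting avic_mode as selected above, summarized for reference
 * (npt_enabled is a hard prerequisite in all cases):
 *
 *	AVIC usable (CPUID or force_avic)   X2AVIC CPUID   avic_mode
 *	no                                  -              AVIC_MODE_NONE
 *	yes                                 no             AVIC_MODE_X1
 *	yes                                 yes            AVIC_MODE_X2
 */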