// SPDX-License-Identifier: GPL-2.0-only
 * Kernel-based Virtual Machine driver for Linux
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 * Yaniv Kamay <yaniv@qumranet.com>
 * Avi Kivity <avi@qumranet.com>
#define pr_fmt(fmt) "SVM: " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <asm/irq_remapping.h>
/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS		8
#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)
#define AVIC_VM_ID_BITS			24
#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)
#define AVIC_GATAG(x, y)		(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
					 (y & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)
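/*
 * For illustration: with AVIC_VCPU_ID_BITS == 8, AVIC_GATAG(0x5, 0x3)
 * encodes to 0x503, and decoding gives AVIC_GATAG_TO_VMID(0x503) == 0x5
 * and AVIC_GATAG_TO_VCPUID(0x503) == 0x3 (the IDs here are hypothetical,
 * chosen only to make the bit layout concrete).
 */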
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
enum avic_modes avic_mode;
 * This is a wrapper of struct amd_iommu_ir_data.
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
 * This function is called from the IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
int avic_ga_log_notifier(u32 ga_tag)
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
	trace_kvm_avic_ga_log(vm_id, vcpu_id);
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page. So, we just need to schedule
	 * in the vCPU.
	kvm_vcpu_wake_up(vcpu);
void avic_vm_destroy(struct kvm *kvm)
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	if (kvm_svm->avic_logical_id_table_page)
		__free_page(kvm_svm->avic_logical_id_table_page);
	if (kvm_svm->avic_physical_id_table_page)
		__free_page(kvm_svm->avic_physical_id_table_page);
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
int avic_vm_init(struct kvm *kvm)
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	/* Allocating physical APIC ID table (4KB) */
	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	kvm_svm->avic_physical_id_table_page = p_page;
	/* Allocating logical APIC ID table (4KB) */
	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	kvm_svm->avic_logical_id_table_page = l_page;
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
	avic_vm_destroy(kvm);
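/*
 * For illustration: AVIC VM IDs come out of a 24-bit space (AVIC_VM_ID_MASK)
 * and 0 is never handed out; once next_vm_id has wrapped at least once,
 * each candidate id is looked up in svm_vm_data_hash so that two live VMs
 * never share an AVIC VM ID (and therefore never share a GA tag prefix).
 */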
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
	if (kvm_apicv_activated(svm->vcpu.kvm))
		vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
	else
		vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
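/*
 * Illustrative note: avic_physical_id above carries both the table address
 * (high bits, via AVIC_HPA_MASK) and, in its low bits, the maximum valid
 * table index (AVIC_MAX_PHYSICAL_ID_COUNT), which is why the count is OR'ed
 * into the same VMCB field rather than written separately.
 */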
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
	u64 *avic_physical_id_table;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
	return &avic_physical_id_table[index];
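/*
 * Rough sketch of a physical APIC ID table entry, as used elsewhere in this
 * file: each 64-bit slot packs the SPA of the vCPU's vAPIC backing page
 * (AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK), the host physical APIC ID the
 * vCPU last ran on (AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK), a valid
 * bit and the IsRunning bit that hardware checks before posting interrupts.
 */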
 * AVIC hardware walks the nested page table to check permissions,
 * but does not use the SPA address specified in the leaf page
 * table entry, since it uses the address in the AVIC_BACKING_PAGE pointer
 * field of the VMCB. Therefore, we set up the
 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
static int avic_alloc_access_page(struct kvm *kvm)
	mutex_lock(&kvm->slots_lock);
	if (kvm->arch.apic_access_memslot_enabled)
	ret = __x86_set_memory_region(kvm,
				      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				      APIC_DEFAULT_PHYS_BASE,
	kvm->arch.apic_access_memslot_enabled = true;
	mutex_unlock(&kvm->slots_lock);
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
	u64 *entry, new_entry;
	int id = vcpu->vcpu_id;
	struct vcpu_svm *svm = to_svm(vcpu);
	if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
	if (!vcpu->arch.apic->regs)
	if (kvm_apicv_activated(vcpu->kvm)) {
		ret = avic_alloc_access_page(vcpu->kvm);
	svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
	/* Set the AVIC backing page address in the physical APIC ID table */
	entry = avic_get_physical_id_entry(vcpu, id);
	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
			       AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
	WRITE_ONCE(*entry, new_entry);
	svm->avic_physical_id_cache = entry;
void avic_ring_doorbell(struct kvm_vcpu *vcpu)
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU. But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	int cpu = READ_ONCE(vcpu->cpu);
	if (cpu != get_cpu())
		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
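/*
 * Roughly speaking (per the AVIC design, not spelled out here): writing the
 * target pCPU's APIC ID to MSR_AMD64_SVM_AVIC_DOORBELL nudges the hardware
 * on that pCPU, which is currently running the target vCPU with IsRunning
 * set, to process the freshly posted vIRR bits without taking a VM-exit.
 */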
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * the destination APIC ID to a vCPU without looping through all vCPUs.
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
	u32 l1_physical_id, dest;
	struct kvm_vcpu *target_vcpu;
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	if (shorthand != APIC_DEST_NOSHORT)
	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);
	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
		l1_physical_id = dest;
		if (WARN_ON_ONCE(l1_physical_id != index))
	if (apic_x2apic_mode(source)) {
		/* 16 bit dest mask, 16 bit cluster id */
		bitmap = dest & 0xFFFF;
		cluster = (dest >> 16) << 4;
	} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
		/* 4 bit dest mask, 4 bit cluster id */
		cluster = (dest >> 4) << 2;
	if (unlikely(!bitmap))
		/* guest bug: nobody to send the logical interrupt to */
	if (!is_power_of_2(bitmap))
		/* multiple logical destinations, use slow path */
	logid_index = cluster + __ffs(bitmap);
	if (apic_x2apic_mode(source)) {
		l1_physical_id = logid_index;
		u32 *avic_logical_id_table =
			page_address(kvm_svm->avic_logical_id_table_page);
		u32 logid_entry = avic_logical_id_table[logid_index];
		if (WARN_ON_ONCE(index != logid_index))
		/* guest bug: non-existent/reserved logical destination */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
		l1_physical_id = logid_entry &
				 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
	if (unlikely(!target_vcpu))
		/* guest bug: non-existent vCPU is the target of this IPI */
	target_vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(target_vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
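/*
 * Worked example for the x2APIC logical-mode fast path above: a destination
 * of 0x00020004 means cluster 2 and an in-cluster bitmap of 0x4, so
 * cluster = (2 << 4) = 32, __ffs(0x4) = 2 and logid_index = 34; in x2APIC
 * mode that index is also used directly as the L1 physical APIC ID.
 */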
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
	struct kvm_vcpu *vcpu;
	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event. There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in the guest at the time of the IPI, and vCPUs that
	 * have since entered the guest will have processed pending IRQs at
	 * VMRUN.
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					GET_XAPIC_DEST_FIELD(icrh),
					icrl & APIC_DEST_MASK)) {
			vcpu->arch.apic->irr_pending = true;
			svm_complete_interrupt_delivery(vcpu,
							icrl & APIC_MODE_MASK,
							icrl & APIC_INT_LEVELTRIG,
							icrl & APIC_VECTOR_MASK);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	struct kvm_lapic *apic = vcpu->arch.apic;
	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
		 * a trap, i.e. ICR holds the correct value and RIP has been
		 * advanced, so KVM is responsible only for emulating the IPI.
		 * Sadly, hardware may sometimes leave the BUSY flag set, in
		 * which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus. So, we just need to kick the appropriate vcpu.
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		pr_err("Unknown IPI interception\n");
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 *logical_apic_id_table;
	int dlid = GET_APIC_LOGICAL_ID(ldr);
	if (flat) { /* flat */
		index = ffs(dlid) - 1;
	} else { /* cluster */
		int cluster = (dlid & 0xf0) >> 4;
		int apic = ffs(dlid & 0x0f) - 1;
		if ((apic < 0) || (apic > 7) ||
		index = (cluster << 2) + apic;
	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
	return &logical_apic_id_table[index];
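/*
 * Worked example for the index math above: in flat mode a logical ID of
 * 0x08 gives index ffs(0x08) - 1 = 3; in cluster mode a logical ID of 0x21
 * (cluster 2, bit 0 set) gives index (2 << 2) + 0 = 8, i.e. four entries
 * per cluster.
 */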
static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
	u32 *entry, new_entry;
	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);
	if (ldr == svm->ldr_reg)
	avic_invalidate_logical_id_entry(vcpu);
	ret = avic_ldr_write(vcpu, id, ldr);
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
	if (svm->dfr_reg == dfr)
	avic_invalidate_logical_id_entry(vcpu);
static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	if (avic_handle_ldr_update(vcpu))
	avic_handle_dfr_update(vcpu);
	kvm_apic_write_nodecode(vcpu, offset);
static bool is_avic_unaccelerated_access_trap(u32 offset)
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);
	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	WARN_ONCE(!write, "svm: Handling trap read.\n");
	ret = avic_unaccel_trap_write(vcpu);
	ret = kvm_emulate_instruction(vcpu, 0);
int avic_init_vcpu(struct vcpu_svm *svm)
	struct kvm_vcpu *vcpu = &svm->vcpu;
	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
	ret = avic_init_backing_page(vcpu);
	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);
	svm->dfr_reg = APIC_DFR_FLAT;
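/*
 * Brief orientation note: ir_list is this vCPU's list of amd_svm_iommu_ir
 * entries, one per IOMMU interrupt-remapping entry that posts directly to
 * the vCPU; it is walked below by avic_set_pi_irte_mode() and
 * avic_update_iommu_vcpu_affinity() whenever APICv state or the vCPU's
 * physical CPU changes.
 */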
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);
	if (!kvm_arch_has_assigned_device(vcpu->kvm))
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	spin_lock_irqsave(&svm->ir_list_lock, flags);
	if (list_empty(&svm->ir_list))
	list_for_each_entry(ir, &svm->ir_list, node) {
		if (activate)
			ret = amd_iommu_activate_guest_mode(ir->data);
		else
			ret = amd_iommu_deactivate_guest_mode(ir->data);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
	struct amd_svm_iommu_ir *cur;
	spin_lock_irqsave(&svm->ir_list_lock, flags);
	list_for_each_entry(cur, &svm->ir_list, node) {
		if (cur->data != pi->ir_data)
		list_del(&cur->node);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
	struct amd_svm_iommu_ir *ir;
	 * In some cases, the existing irte is updated and re-set,
	 * so we need to check here if it's already been added
	 * to the ir_list.
	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
		struct kvm *kvm = svm->vcpu.kvm;
		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
		struct vcpu_svm *prev_svm;
		prev_svm = to_svm(prev_vcpu);
		svm_ir_list_del(prev_svm, pi);
	 * Allocate a new amd_svm_iommu_ir entry, which will be added
	 * to the per-vcpu ir_list.
	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
	ir->data = pi->ir_data;
	spin_lock_irqsave(&svm->ir_list_lock, flags);
	list_add(&ir->node, &svm->ir_list);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
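/*
 * For illustration: when an IRTE that used to post to vCPU A is re-targeted
 * to vCPU B, pi->prev_ga_tag still carries A's GA tag, so the stale entry is
 * first deleted from A's ir_list (via svm_ir_list_del() above) before the
 * new entry is added to B's list.
 */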
 * The HW cannot support posting multicast/broadcast
 * interrupts to a vCPU. So, we still use legacy interrupt
 * remapping for this kind of interrupt.
 *
 * For lowest-priority interrupts, we only support
 * those with a single CPU as the destination, e.g. the user
 * configures the interrupts via /proc/irq or uses
 * irqbalance to make the interrupts single-CPU.
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu = NULL;
	kvm_set_msi_irq(kvm, e, &irq);
	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
	    !kvm_irq_is_postable(&irq)) {
		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
			 __func__, irq.vector);
	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
	vcpu_info->vector = irq.vector;
 * avic_pi_update_irte - set IRTE for Posted-Interrupts
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
			uint32_t guest_irq, bool set)
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP))
	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
		 __func__, host_irq, guest_irq, set);
	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		struct vcpu_data vcpu_info;
		struct vcpu_svm *svm = NULL;
		if (e->type != KVM_IRQ_ROUTING_MSI)
		 * Here, we set up legacy mode in the following cases:
		 * 1. When the interrupt cannot be targeted to a specific vcpu.
		 * 2. Unsetting posted interrupt.
		 * 3. APIC virtualization is disabled for the vcpu.
		 * 4. IRQ has an incompatible delivery mode (SMI, INIT, etc.)
		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
		    kvm_vcpu_apicv_active(&svm->vcpu)) {
			struct amd_iommu_pi_data pi;
			/* Try to enable guest_mode in IRTE */
			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
			pi.is_guest_mode = true;
			pi.vcpu_data = &vcpu_info;
			ret = irq_set_vcpu_affinity(host_irq, &pi);
			 * Here, we have successfully set up vcpu affinity in
			 * IOMMU guest mode. Now, we need to store the posted
			 * interrupt information in a per-vcpu ir_list so that
			 * we can reference it directly when we update vcpu
			 * scheduling information in the IOMMU irte.
			if (!ret && pi.is_guest_mode)
				svm_ir_list_add(svm, &pi);
			/* Use legacy mode in IRTE */
			struct amd_iommu_pi_data pi;
			 * Here, pi is used to:
			 * - Tell IOMMU to use legacy mode for this interrupt.
			 * - Retrieve ga_tag of prior interrupt remapping data.
			pi.is_guest_mode = false;
			ret = irq_set_vcpu_affinity(host_irq, &pi);
			 * Check if the posted interrupt was previously
			 * set up with guest_mode by checking if the ga_tag
			 * was cached. If so, we need to clean up the per-vcpu
			 * ir_list.
			if (!ret && pi.prev_ga_tag) {
				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
				struct kvm_vcpu *vcpu;
				vcpu = kvm_get_vcpu_by_id(kvm, id);
				svm_ir_list_del(to_svm(vcpu), &pi);
		trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
					 e->gsi, vcpu_info.vector,
					 vcpu_info.pi_desc_addr, set);
		pr_err("%s: failed to update PI IRTE\n", __func__);
	srcu_read_unlock(&kvm->irq_srcu, idx);
bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
			  BIT(APICV_INHIBIT_REASON_ABSENT) |
			  BIT(APICV_INHIBIT_REASON_HYPERV) |
			  BIT(APICV_INHIBIT_REASON_NESTED) |
			  BIT(APICV_INHIBIT_REASON_IRQWIN) |
			  BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
			  BIT(APICV_INHIBIT_REASON_X2APIC) |
			  BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
			  BIT(APICV_INHIBIT_REASON_SEV) |
			  BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
	return supported & BIT(reason);
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);
	if (!kvm_arch_has_assigned_device(vcpu->kvm))
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	spin_lock_irqsave(&svm->ir_list_lock, flags);
	if (list_empty(&svm->ir_list))
	list_for_each_entry(ir, &svm->ir_list, node) {
		ret = amd_iommu_update_ga(cpu, r, ir->data);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	lockdep_assert_preemption_disabled();
	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted. The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	if (kvm_vcpu_is_blocking(vcpu))
	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
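/*
 * In short: on load the vCPU's physical APIC ID table entry is stamped with
 * the new host APIC ID and IsRunning is set, then amd_iommu_update_ga()
 * retargets every posted-interrupt IRTE on this vCPU's ir_list so assigned
 * devices deliver straight to the pCPU the vCPU now runs on; avic_vcpu_put()
 * below undoes the IsRunning bit on the way out.
 */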
void avic_vcpu_put(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	lockdep_assert_preemption_disabled();
	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;
	bool activated = kvm_vcpu_apicv_active(vcpu);
		 * During AVIC temporary deactivation, the guest could update
		 * APIC ID, DFR and LDR registers, which would not be trapped
		 * by avic_unaccelerated_access_interception(). In this case,
		 * we need to check and update the AVIC logical APIC ID table
		 * accordingly before re-activating.
		avic_apicv_post_state_restore(vcpu);
		vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
		vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
	if (activated)
		avic_vcpu_load(vcpu, vcpu->cpu);
	else
		avic_vcpu_put(vcpu);
	avic_set_pi_irte_mode(vcpu, activated);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
	if (!kvm_vcpu_apicv_active(vcpu))
	 * Unload the AVIC when the vCPU is about to block, _before_
	 * the vCPU actually blocks.
	 *
	 * Any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source, therefore vIRR will also
	 * be checked by kvm_vcpu_check_block() before blocking. The
	 * memory barrier implicit in set_current_state orders writing
	 * IsRunning=0 before reading the vIRR. The processor needs a
	 * matching memory barrier on interrupt delivery between writing
	 * IRR and reading IsRunning; the lack of this barrier might be
	 * the cause of erratum #1235.
	avic_vcpu_put(vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
	if (!kvm_vcpu_apicv_active(vcpu))
	avic_vcpu_load(vcpu, vcpu->cpu);
 * - The module param avic enables both xAPIC and x2APIC mode.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
	if (boot_cpu_has(X86_FEATURE_AVIC)) {
		avic_mode = AVIC_MODE_X1;
		pr_info("AVIC enabled\n");
	} else if (force_avic) {
		 * Some older systems do not advertise AVIC support.
		 * See the Revision Guide for the specific AMD processor for more detail.
		avic_mode = AVIC_MODE_X1;
		pr_warn("AVIC is not supported in CPUID but force enabled");
		pr_warn("Your system might crash and burn");
	/* AVIC is a prerequisite for x2AVIC. */
	if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
		if (avic_mode == AVIC_MODE_X1) {
			avic_mode = AVIC_MODE_X2;
			pr_info("x2AVIC enabled\n");
			pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled");
			pr_warn(FW_BUG "Try enabling AVIC using the force_avic option");
	if (avic_mode != AVIC_MODE_NONE)
		amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
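/*
 * For reference: by this point avic_mode is AVIC_MODE_NONE (AVIC disabled),
 * AVIC_MODE_X1 (xAPIC virtualization only) or AVIC_MODE_X2 (xAPIC plus
 * x2APIC virtualization), and the GA log notifier is registered only when
 * some form of AVIC is actually enabled.
 */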