Merge tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 21 Feb 2021 21:24:39 +0000 (13:24 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 21 Feb 2021 21:24:39 +0000 (13:24 -0800)
Pull Hyper-V updates from Wei Liu:

 - VMBus hardening patches from Andrea Parri and Andres Beltran.

 - Patches to make Linux boot as the root partition on Microsoft
   Hypervisor from Wei Liu.

 - One patch to add a new sysfs interface to support hibernation on
   Hyper-V from Dexuan Cui.

 - Two miscellaneous clean-up patches from Colin and Gustavo.

* tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (31 commits)
  Revert "Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer"
  iommu/hyperv: setup an IO-APIC IRQ remapping domain for root partition
  x86/hyperv: implement an MSI domain for root partition
  asm-generic/hyperv: import data structures for mapping device interrupts
  asm-generic/hyperv: introduce hv_device_id and auxiliary structures
  asm-generic/hyperv: update hv_interrupt_entry
  asm-generic/hyperv: update hv_msi_entry
  x86/hyperv: implement and use hv_smp_prepare_cpus
  x86/hyperv: provide a bunch of helper functions
  ACPI / NUMA: add a stub function for node_to_pxm()
  x86/hyperv: handling hypercall page setup for root
  x86/hyperv: extract partition ID from Microsoft Hypervisor if necessary
  x86/hyperv: allocate output arg pages if required
  clocksource/hyperv: use MSR-based access if running as root
  Drivers: hv: vmbus: skip VMBus initialization if Linux is root
  x86/hyperv: detect if Linux is the root partition
  asm-generic/hyperv: change HV_CPU_POWER_MANAGEMENT to HV_CPU_MANAGEMENT
  hv: hyperv.h: Replace one-element array with flexible-array in struct icmsg_negotiate
  hv_netvsc: Restrict configurations on isolated guests
  Drivers: hv: vmbus: Enforce 'VMBus version >= 5.2' on isolated guests
  ...

24 files changed:
Documentation/ABI/stable/sysfs-bus-vmbus
arch/x86/hyperv/Makefile
arch/x86/hyperv/hv_init.c
arch/x86/hyperv/hv_proc.c [new file with mode: 0644]
arch/x86/hyperv/irqdomain.c [new file with mode: 0644]
arch/x86/include/asm/hyperv-tlfs.h
arch/x86/include/asm/mshyperv.h
arch/x86/kernel/cpu/mshyperv.c
drivers/clocksource/hyperv_timer.c
drivers/hv/channel.c
drivers/hv/channel_mgmt.c
drivers/hv/connection.c
drivers/hv/hv_fcopy.c
drivers/hv/hv_kvp.c
drivers/hv/hv_snapshot.c
drivers/hv/hv_util.c
drivers/hv/vmbus_drv.c
drivers/iommu/hyperv-iommu.c
drivers/net/hyperv/netvsc.c
drivers/pci/controller/pci-hyperv.c
include/acpi/acpi_numa.h
include/asm-generic/hyperv-tlfs.h
include/asm-generic/mshyperv.h
include/linux/hyperv.h

index c27b7b8..42599d9 100644 (file)
@@ -1,3 +1,10 @@
+What:          /sys/bus/vmbus/hibernation
+Date:          Jan 2021
+KernelVersion: 5.12
+Contact:       Dexuan Cui <decui@microsoft.com>
+Description:   Whether the host supports hibernation for the VM.
+Users:         Daemon that sets up swap partition/file for hibernation.
+
 What:          /sys/bus/vmbus/devices/<UUID>/id
 Date:          Jul 2009
 KernelVersion: 2.6.31
index 89b1f74..48e2c51 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y                  := hv_init.o mmu.o nested.o
-obj-$(CONFIG_X86_64)   += hv_apic.o
+obj-y                  := hv_init.o mmu.o nested.o irqdomain.o
+obj-$(CONFIG_X86_64)   += hv_apic.o hv_proc.o
 
 ifdef CONFIG_X86_64
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)       += hv_spinlock.o
index 6375967..b81047d 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/acpi.h>
 #include <linux/efi.h>
 #include <linux/types.h>
+#include <linux/bitfield.h>
 #include <asm/apic.h>
 #include <asm/desc.h>
 #include <asm/hypervisor.h>
 #include <linux/cpuhotplug.h>
 #include <linux/syscore_ops.h>
 #include <clocksource/hyperv_timer.h>
+#include <linux/highmem.h>
 
 int hyperv_init_cpuhp;
+u64 hv_current_partition_id = ~0ull;
+EXPORT_SYMBOL_GPL(hv_current_partition_id);
 
 void *hv_hypercall_pg;
 EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -44,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page);
 void  __percpu **hyperv_pcpu_input_arg;
 EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
 
+void  __percpu **hyperv_pcpu_output_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
+
 u32 hv_max_vp_index;
 EXPORT_SYMBOL_GPL(hv_max_vp_index);
 
@@ -76,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu)
        void **input_arg;
        struct page *pg;
 
-       input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
        /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
-       pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
+       pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
        if (unlikely(!pg))
                return -ENOMEM;
+
+       input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
        *input_arg = page_address(pg);
+       if (hv_root_partition) {
+               void **output_arg;
+
+               output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+               *output_arg = page_address(pg + 1);
+       }
 
        hv_get_vp_index(msr_vp_index);
 
@@ -208,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu)
        unsigned int new_cpu;
        unsigned long flags;
        void **input_arg;
-       void *input_pg = NULL;
+       void *pg;
 
        local_irq_save(flags);
        input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
-       input_pg = *input_arg;
+       pg = *input_arg;
        *input_arg = NULL;
+
+       if (hv_root_partition) {
+               void **output_arg;
+
+               output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+               *output_arg = NULL;
+       }
+
        local_irq_restore(flags);
-       free_page((unsigned long)input_pg);
+
+       free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
 
        if (hv_vp_assist_page && hv_vp_assist_page[cpu])
                wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -264,6 +287,9 @@ static int hv_suspend(void)
        union hv_x64_msr_hypercall_contents hypercall_msr;
        int ret;
 
+       if (hv_root_partition)
+               return -EPERM;
+
        /*
         * Reset the hypercall page as it is going to be invalidated
         * across hibernation. Setting hv_hypercall_pg to NULL ensures
@@ -334,6 +360,24 @@ static void __init hv_stimer_setup_percpu_clockev(void)
                old_setup_percpu_clockev();
 }
 
+static void __init hv_get_partition_id(void)
+{
+       struct hv_get_partition_id *output_page;
+       u64 status;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+       status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
+       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+               /* No point in proceeding if this failed */
+               pr_err("Failed to get partition ID: %lld\n", status);
+               BUG();
+       }
+       hv_current_partition_id = output_page->partition_id;
+       local_irq_restore(flags);
+}
+
 /*
  * This function is to be invoked early in the boot sequence after the
  * hypervisor has been detected.
@@ -368,6 +412,12 @@ void __init hyperv_init(void)
 
        BUG_ON(hyperv_pcpu_input_arg == NULL);
 
+       /* Allocate the per-CPU state for output arg for root */
+       if (hv_root_partition) {
+               hyperv_pcpu_output_arg = alloc_percpu(void *);
+               BUG_ON(hyperv_pcpu_output_arg == NULL);
+       }
+
        /* Allocate percpu VP index */
        hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
                                    GFP_KERNEL);
@@ -408,8 +458,35 @@ void __init hyperv_init(void)
 
        rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
        hypercall_msr.enable = 1;
-       hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
-       wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+       if (hv_root_partition) {
+               struct page *pg;
+               void *src, *dst;
+
+               /*
+                * For the root partition, the hypervisor will set up its
+                * hypercall page. The hypervisor guarantees it will not show
+                * up in the root's address space. The root can't change the
+                * location of the hypercall page.
+                *
+                * Order is important here. We must enable the hypercall page
+                * so it is populated with code, then copy the code to an
+                * executable page.
+                */
+               wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+               pg = vmalloc_to_page(hv_hypercall_pg);
+               dst = kmap(pg);
+               src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
+                               MEMREMAP_WB);
+               BUG_ON(!(src && dst));
+               memcpy(dst, src, HV_HYP_PAGE_SIZE);
+               memunmap(src);
+               kunmap(pg);
+       } else {
+               hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
+               wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+       }
 
        /*
         * hyperv_init() is called before LAPIC is initialized: see
@@ -428,6 +505,21 @@ void __init hyperv_init(void)
        register_syscore_ops(&hv_syscore_ops);
 
        hyperv_init_cpuhp = cpuhp;
+
+       if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
+               hv_get_partition_id();
+
+       BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
+
+#ifdef CONFIG_PCI_MSI
+       /*
+        * If we're running as root, we want to create our own PCI MSI domain.
+        * We can't set this in hv_pci_init because that would be too late.
+        */
+       if (hv_root_partition)
+               x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
+#endif
+
        return;
 
 remove_cpuhp_state:
@@ -552,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
 
 bool hv_is_hibernation_supported(void)
 {
-       return acpi_sleep_state_supported(ACPI_STATE_S4);
+       return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
 }
 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+       if (!(ms_hyperv.features_b & HV_ISOLATION))
+               return HV_ISOLATION_TYPE_NONE;
+       return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+bool hv_is_isolation_supported(void)
+{
+       return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
new file mode 100644 (file)
index 0000000..60461e5
--- /dev/null
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/acpi.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <linux/minmax.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+
+#include <asm/trace/hyperv.h>
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled.  */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+       struct page **pages, *page;
+       int *counts;
+       int num_allocations;
+       int i, j, page_count;
+       int order;
+       u64 status;
+       int ret;
+       u64 base_pfn;
+       struct hv_deposit_memory *input_page;
+       unsigned long flags;
+
+       if (num_pages > HV_DEPOSIT_MAX)
+               return -E2BIG;
+       if (!num_pages)
+               return 0;
+
+       /* One buffer for page pointers and counts */
+       page = alloc_page(GFP_KERNEL);
+       if (!page)
+               return -ENOMEM;
+       pages = page_address(page);
+
+       counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
+       if (!counts) {
+               free_page((unsigned long)pages);
+               return -ENOMEM;
+       }
+
+       /* Allocate all the pages before disabling interrupts */
+       i = 0;
+
+       while (num_pages) {
+               /* Find highest order we can actually allocate */
+               order = 31 - __builtin_clz(num_pages);
+
+               while (1) {
+                       pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+                       if (pages[i])
+                               break;
+                       if (!order) {
+                               ret = -ENOMEM;
+                               num_allocations = i;
+                               goto err_free_allocations;
+                       }
+                       --order;
+               }
+
+               split_page(pages[i], order);
+               counts[i] = 1 << order;
+               num_pages -= counts[i];
+               i++;
+       }
+       num_allocations = i;
+
+       local_irq_save(flags);
+
+       input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+       input_page->partition_id = partition_id;
+
+       /* Populate gpa_page_list - these will fit on the input page */
+       for (i = 0, page_count = 0; i < num_allocations; ++i) {
+               base_pfn = page_to_pfn(pages[i]);
+               for (j = 0; j < counts[i]; ++j, ++page_count)
+                       input_page->gpa_page_list[page_count] = base_pfn + j;
+       }
+       status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+                                    page_count, 0, input_page, NULL);
+       local_irq_restore(flags);
+
+       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+               pr_err("Failed to deposit pages: %lld\n", status);
+               ret = status;
+               goto err_free_allocations;
+       }
+
+       ret = 0;
+       goto free_buf;
+
+err_free_allocations:
+       for (i = 0; i < num_allocations; ++i) {
+               base_pfn = page_to_pfn(pages[i]);
+               for (j = 0; j < counts[i]; ++j)
+                       __free_page(pfn_to_page(base_pfn + j));
+       }
+
+free_buf:
+       free_page((unsigned long)pages);
+       kfree(counts);
+       return ret;
+}
+
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
+{
+       struct hv_add_logical_processor_in *input;
+       struct hv_add_logical_processor_out *output;
+       u64 status;
+       unsigned long flags;
+       int ret = 0;
+       int pxm = node_to_pxm(node);
+
+       /*
+        * When adding a logical processor, the hypervisor may return
+        * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
+        * pages and retry.
+        */
+       do {
+               local_irq_save(flags);
+
+               input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+               /* We don't do anything with the output right now */
+               output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+               input->lp_index = lp_index;
+               input->apic_id = apic_id;
+               input->flags = 0;
+               input->proximity_domain_info.domain_id = pxm;
+               input->proximity_domain_info.flags.reserved = 0;
+               input->proximity_domain_info.flags.proximity_info_valid = 1;
+               input->proximity_domain_info.flags.proximity_preferred = 1;
+               status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
+                                        input, output);
+               local_irq_restore(flags);
+
+               status &= HV_HYPERCALL_RESULT_MASK;
+
+               if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
+                       if (status != HV_STATUS_SUCCESS) {
+                               pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
+                                      lp_index, apic_id, status);
+                               ret = status;
+                       }
+                       break;
+               }
+               ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
+       } while (!ret);
+
+       return ret;
+}
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+       struct hv_create_vp *input;
+       u64 status;
+       unsigned long irq_flags;
+       int ret = 0;
+       int pxm = node_to_pxm(node);
+
+       /* Root VPs don't seem to need pages deposited */
+       if (partition_id != hv_current_partition_id) {
+               /* The value 90 is empirically determined. It may change. */
+               ret = hv_call_deposit_pages(node, partition_id, 90);
+               if (ret)
+                       return ret;
+       }
+
+       do {
+               local_irq_save(irq_flags);
+
+               input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+               input->partition_id = partition_id;
+               input->vp_index = vp_index;
+               input->flags = flags;
+               input->subnode_type = HvSubnodeAny;
+               if (node != NUMA_NO_NODE) {
+                       input->proximity_domain_info.domain_id = pxm;
+                       input->proximity_domain_info.flags.reserved = 0;
+                       input->proximity_domain_info.flags.proximity_info_valid = 1;
+                       input->proximity_domain_info.flags.proximity_preferred = 1;
+               } else {
+                       input->proximity_domain_info.as_uint64 = 0;
+               }
+               status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+               local_irq_restore(irq_flags);
+
+               status &= HV_HYPERCALL_RESULT_MASK;
+
+               if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
+                       if (status != HV_STATUS_SUCCESS) {
+                               pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
+                                      vp_index, flags, status);
+                               ret = status;
+                       }
+                       break;
+               }
+               ret = hv_call_deposit_pages(node, partition_id, 1);
+
+       } while (!ret);
+
+       return ret;
+}
+
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
new file mode 100644 (file)
index 0000000..4421a8d
--- /dev/null
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
+ *
+ * Authors:
+ *  Sunil Muthuswamy <sunilmut@microsoft.com>
+ *  Wei Liu <wei.liu@kernel.org>
+ */
+
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/mshyperv.h>
+
+static int hv_map_interrupt(union hv_device_id device_id, bool level,
+               int cpu, int vector, struct hv_interrupt_entry *entry)
+{
+       struct hv_input_map_device_interrupt *input;
+       struct hv_output_map_device_interrupt *output;
+       struct hv_device_interrupt_descriptor *intr_desc;
+       unsigned long flags;
+       u64 status;
+       int nr_bank, var_size;
+
+       local_irq_save(flags);
+
+       input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+       output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+       intr_desc = &input->interrupt_descriptor;
+       memset(input, 0, sizeof(*input));
+       input->partition_id = hv_current_partition_id;
+       input->device_id = device_id.as_uint64;
+       intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+       intr_desc->vector_count = 1;
+       intr_desc->target.vector = vector;
+
+       if (level)
+               intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
+       else
+               intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+       intr_desc->target.vp_set.valid_bank_mask = 0;
+       intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+       nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
+       if (nr_bank < 0) {
+               local_irq_restore(flags);
+               pr_err("%s: unable to generate VP set\n", __func__);
+               return EINVAL;
+       }
+       intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+       /*
+        * var-sized hypercall, var-size starts after vp_mask (thus
+        * vp_set.format does not count, but vp_set.valid_bank_mask
+        * does).
+        */
+       var_size = nr_bank + 1;
+
+       status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+                       input, output);
+       *entry = output->interrupt_entry;
+
+       local_irq_restore(flags);
+
+       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+               pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+
+       return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+{
+       unsigned long flags;
+       struct hv_input_unmap_device_interrupt *input;
+       struct hv_interrupt_entry *intr_entry;
+       u64 status;
+
+       local_irq_save(flags);
+       input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+       memset(input, 0, sizeof(*input));
+       intr_entry = &input->interrupt_entry;
+       input->partition_id = hv_current_partition_id;
+       input->device_id = id;
+       *intr_entry = *old_entry;
+
+       status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+       local_irq_restore(flags);
+
+       return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+#ifdef CONFIG_PCI_MSI
+struct rid_data {
+       struct pci_dev *bridge;
+       u32 rid;
+};
+
+static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
+{
+       struct rid_data *rd = data;
+       u8 bus = PCI_BUS_NUM(rd->rid);
+
+       if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) {
+               rd->bridge = pdev;
+               rd->rid = alias;
+       }
+
+       return 0;
+}
+
+static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+{
+       union hv_device_id dev_id;
+       struct rid_data data = {
+               .bridge = NULL,
+               .rid = PCI_DEVID(dev->bus->number, dev->devfn)
+       };
+
+       pci_for_each_dma_alias(dev, get_rid_cb, &data);
+
+       dev_id.as_uint64 = 0;
+       dev_id.device_type = HV_DEVICE_TYPE_PCI;
+       dev_id.pci.segment = pci_domain_nr(dev->bus);
+
+       dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+       dev_id.pci.bdf.device = PCI_SLOT(data.rid);
+       dev_id.pci.bdf.function = PCI_FUNC(data.rid);
+       dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+
+       if (data.bridge) {
+               int pos;
+
+               /*
+                * Microsoft Hypervisor requires a bus range when the bridge is
+                * running in PCI-X mode.
+                *
+                * To distinguish conventional vs PCI-X bridge, we can check
+                * the bridge's PCI-X Secondary Status Register, Secondary Bus
+                * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+                * Specification Revision 1.0 5.2.2.1.3.
+                *
+                * Value zero means it is in conventional mode, otherwise it is
+                * in PCI-X mode.
+                */
+
+               pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+               if (pos) {
+                       u16 status;
+
+                       pci_read_config_word(data.bridge, pos +
+                                       PCI_X_BRIDGE_SSTATUS, &status);
+
+                       if (status & PCI_X_SSTATUS_FREQ) {
+                               /* Non-zero, PCI-X mode */
+                               u8 sec_bus, sub_bus;
+
+                               dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+
+                               pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
+                               dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
+                               pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
+                               dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
+                       }
+               }
+       }
+
+       return dev_id;
+}
+
+static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
+                               struct hv_interrupt_entry *entry)
+{
+       union hv_device_id device_id = hv_build_pci_dev_id(dev);
+
+       return hv_map_interrupt(device_id, false, cpu, vector, entry);
+}
+
+static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+{
+       /* High address is always 0 */
+       msg->address_hi = 0;
+       msg->address_lo = entry->msi_entry.address.as_uint32;
+       msg->data = entry->msi_entry.data.as_uint32;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       struct msi_desc *msidesc;
+       struct pci_dev *dev;
+       struct hv_interrupt_entry out_entry, *stored_entry;
+       struct irq_cfg *cfg = irqd_cfg(data);
+       cpumask_t *affinity;
+       int cpu;
+       u64 status;
+
+       msidesc = irq_data_get_msi_desc(data);
+       dev = msi_desc_to_pci_dev(msidesc);
+
+       if (!cfg) {
+               pr_debug("%s: cfg is NULL", __func__);
+               return;
+       }
+
+       affinity = irq_data_get_effective_affinity_mask(data);
+       cpu = cpumask_first_and(affinity, cpu_online_mask);
+
+       if (data->chip_data) {
+               /*
+                * This interrupt is already mapped. Let's unmap first.
+                *
+                * We don't use retarget interrupt hypercalls here because
+                * Microsoft Hypervisor doesn't allow root to change the vector
+                * or specify VPs outside of the set that is initially used
+                * during mapping.
+                */
+               stored_entry = data->chip_data;
+               data->chip_data = NULL;
+
+               status = hv_unmap_msi_interrupt(dev, stored_entry);
+
+               kfree(stored_entry);
+
+               if (status != HV_STATUS_SUCCESS) {
+                       pr_debug("%s: failed to unmap, status %lld", __func__, status);
+                       return;
+               }
+       }
+
+       stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
+       if (!stored_entry) {
+               pr_debug("%s: failed to allocate chip data\n", __func__);
+               return;
+       }
+
+       status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
+       if (status != HV_STATUS_SUCCESS) {
+               kfree(stored_entry);
+               return;
+       }
+
+       *stored_entry = out_entry;
+       data->chip_data = stored_entry;
+       entry_to_msi_msg(&out_entry, msg);
+
+       return;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+{
+       return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+}
+
+static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+{
+       u64 status;
+       struct hv_interrupt_entry old_entry;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       struct msi_msg msg;
+
+       desc = irq_to_desc(irq);
+       if (!desc) {
+               pr_debug("%s: no irq desc\n", __func__);
+               return;
+       }
+
+       data = &desc->irq_data;
+       if (!data) {
+               pr_debug("%s: no irq data\n", __func__);
+               return;
+       }
+
+       if (!data->chip_data) {
+               pr_debug("%s: no chip data\n!", __func__);
+               return;
+       }
+
+       old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+       entry_to_msi_msg(&old_entry, &msg);
+
+       kfree(data->chip_data);
+       data->chip_data = NULL;
+
+       status = hv_unmap_msi_interrupt(dev, &old_entry);
+
+       if (status != HV_STATUS_SUCCESS) {
+               pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+               return;
+       }
+}
+
+static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+       int i;
+       struct msi_desc *entry;
+       struct pci_dev *pdev;
+
+       if (WARN_ON_ONCE(!dev_is_pci(dev)))
+               return;
+
+       pdev = to_pci_dev(dev);
+
+       for_each_pci_msi_entry(entry, pdev) {
+               if (entry->irq) {
+                       for (i = 0; i < entry->nvec_used; i++) {
+                               hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
+                               irq_domain_free_irqs(entry->irq + i, 1);
+                       }
+               }
+       }
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip hv_pci_msi_controller = {
+       .name                   = "HV-PCI-MSI",
+       .irq_unmask             = pci_msi_unmask_irq,
+       .irq_mask               = pci_msi_mask_irq,
+       .irq_ack                = irq_chip_ack_parent,
+       .irq_retrigger          = irq_chip_retrigger_hierarchy,
+       .irq_compose_msi_msg    = hv_irq_compose_msi_msg,
+       .irq_set_affinity       = msi_domain_set_affinity,
+       .flags                  = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+       .domain_free_irqs       = hv_msi_domain_free_irqs,
+       .msi_prepare            = pci_msi_prepare,
+};
+
+static struct msi_domain_info hv_pci_msi_domain_info = {
+       .flags          = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+                         MSI_FLAG_PCI_MSIX,
+       .ops            = &pci_msi_domain_ops,
+       .chip           = &hv_pci_msi_controller,
+       .handler        = handle_edge_irq,
+       .handler_name   = "edge",
+};
+
+struct irq_domain * __init hv_create_pci_msi_domain(void)
+{
+       struct irq_domain *d = NULL;
+       struct fwnode_handle *fn;
+
+       fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
+       if (fn)
+               d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain);
+
+       /* No point in going further if we can't get an irq domain */
+       BUG_ON(!d);
+
+       return d;
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
+{
+       union hv_device_id device_id;
+
+       device_id.as_uint64 = 0;
+       device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+       device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+       return hv_unmap_interrupt(device_id.as_uint64, entry);
+}
+EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
+               struct hv_interrupt_entry *entry)
+{
+       union hv_device_id device_id;
+
+       device_id.as_uint64 = 0;
+       device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+       device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+       return hv_map_interrupt(device_id, level, cpu, vector, entry);
+}
+EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
index 6bf42ae..e6cd3fe 100644 (file)
@@ -21,7 +21,9 @@
 #define HYPERV_CPUID_FEATURES                  0x40000003
 #define HYPERV_CPUID_ENLIGHTMENT_INFO          0x40000004
 #define HYPERV_CPUID_IMPLEMENT_LIMITS          0x40000005
+#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES   0x40000007
 #define HYPERV_CPUID_NESTED_FEATURES           0x4000000A
+#define HYPERV_CPUID_ISOLATION_CONFIG          0x4000000C
 
 #define HYPERV_CPUID_VIRT_STACK_INTERFACE      0x40000081
 #define HYPERV_VS_INTERFACE_EAX_SIGNATURE      0x31235356  /* "VS#1" */
 /* Recommend using enlightened VMCS */
 #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED            BIT(14)
 
+/*
+ * CPU management features identification.
+ * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
+ */
+#define HV_X64_START_LOGICAL_PROCESSOR                 BIT(0)
+#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR           BIT(1)
+#define HV_X64_PERFORMANCE_COUNTER_SYNC                        BIT(2)
+#define HV_X64_RESERVED_IDENTITY_BIT                   BIT(31)
+
 /*
  * Virtual processor will never share a physical core with another virtual
  * processor, except for virtual processors that are reported as sibling SMT
 #define HV_X64_NESTED_GUEST_MAPPING_FLUSH              BIT(18)
 #define HV_X64_NESTED_MSR_BITMAP                       BIT(19)
 
+/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
+#define HV_PARAVISOR_PRESENT                           BIT(0)
+
+/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */
+#define HV_ISOLATION_TYPE                              GENMASK(3, 0)
+#define HV_SHARED_GPA_BOUNDARY_ACTIVE                  BIT(5)
+#define HV_SHARED_GPA_BOUNDARY_BITS                    GENMASK(11, 6)
+
+enum hv_isolation_type {
+       HV_ISOLATION_TYPE_NONE  = 0,
+       HV_ISOLATION_TYPE_VBS   = 1,
+       HV_ISOLATION_TYPE_SNP   = 2
+};
+
 /* Hyper-V specific model specific registers (MSRs) */
 
 /* MSR used to identify the guest OS. */
@@ -523,6 +548,19 @@ struct hv_partition_assist_pg {
        u32 tlb_lock_count;
 };
 
+enum hv_interrupt_type {
+       HV_X64_INTERRUPT_TYPE_FIXED             = 0x0000,
+       HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY    = 0x0001,
+       HV_X64_INTERRUPT_TYPE_SMI               = 0x0002,
+       HV_X64_INTERRUPT_TYPE_REMOTEREAD        = 0x0003,
+       HV_X64_INTERRUPT_TYPE_NMI               = 0x0004,
+       HV_X64_INTERRUPT_TYPE_INIT              = 0x0005,
+       HV_X64_INTERRUPT_TYPE_SIPI              = 0x0006,
+       HV_X64_INTERRUPT_TYPE_EXTINT            = 0x0007,
+       HV_X64_INTERRUPT_TYPE_LOCALINT0         = 0x0008,
+       HV_X64_INTERRUPT_TYPE_LOCALINT1         = 0x0009,
+       HV_X64_INTERRUPT_TYPE_MAXIMUM           = 0x000A,
+};
 
 #include <asm-generic/hyperv-tlfs.h>
 
index 30f76b9..ccf60a8 100644 (file)
@@ -78,6 +78,13 @@ extern int hyperv_init_cpuhp;
 
 extern void *hv_hypercall_pg;
 extern void  __percpu  **hyperv_pcpu_input_arg;
+extern void  __percpu  **hyperv_pcpu_output_arg;
+
+extern u64 hv_current_partition_id;
+
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
 
 static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 {
@@ -239,6 +246,8 @@ int hyperv_fill_flush_guest_mapping_list(
                struct hv_guest_mapping_flush_list *flush,
                u64 start_gfn, u64 end_gfn);
 
+extern bool hv_root_partition;
+
 #ifdef CONFIG_X86_64
 void hv_apic_init(void);
 void __init hv_init_spinlocks(void);
@@ -250,10 +259,16 @@ static inline void hv_apic_init(void) {}
 static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
                                              struct msi_desc *msi_desc)
 {
-       msi_entry->address = msi_desc->msg.address_lo;
-       msi_entry->data = msi_desc->msg.data;
+       msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
+       msi_entry->data.as_uint32 = msi_desc->msg.data;
 }
 
+struct irq_domain *hv_create_pci_msi_domain(void);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
+               struct hv_interrupt_entry *entry);
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
+
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline void hyperv_setup_mmu_ops(void) {}
index 43b54be..e88bc29 100644 (file)
 #include <asm/reboot.h>
 #include <asm/nmi.h>
 #include <clocksource/hyperv_timer.h>
+#include <asm/numa.h>
+
+/* Is Linux running as the root partition? */
+bool hv_root_partition;
+EXPORT_SYMBOL_GPL(hv_root_partition);
 
 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -226,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void)
        hv_init_spinlocks();
 #endif
 }
+
+static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_64
+       int i;
+       int ret;
+#endif
+
+       native_smp_prepare_cpus(max_cpus);
+
+#ifdef CONFIG_X86_64
+       for_each_present_cpu(i) {
+               if (i == 0)
+                       continue;
+               ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+               BUG_ON(ret);
+       }
+
+       for_each_present_cpu(i) {
+               if (i == 0)
+                       continue;
+               ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
+               BUG_ON(ret);
+       }
+#endif
+}
 #endif
 
 static void __init ms_hyperv_init_platform(void)
@@ -243,6 +274,7 @@ static void __init ms_hyperv_init_platform(void)
         * Extract the features and hints
         */
        ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+       ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
        ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
        ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
@@ -255,6 +287,22 @@ static void __init ms_hyperv_init_platform(void)
        pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
                 ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
 
+       /*
+        * Check CPU management privilege.
+        *
+        * To mirror what Windows does we should extract CPU management
+        * features and use the ReservedIdentityBit to detect if Linux is the
+        * root partition. But that requires negotiating CPU management
+        * interface (a process to be finalized).
+        *
+        * For now, use the privilege flag as the indicator for running as
+        * root.
+        */
+       if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
+               hv_root_partition = true;
+               pr_info("Hyper-V: running as root partition\n");
+       }
+
        /*
         * Extract host information.
         */
@@ -277,6 +325,14 @@ static void __init ms_hyperv_init_platform(void)
                x86_platform.calibrate_cpu = hv_get_tsc_khz;
        }
 
+       if (ms_hyperv.features_b & HV_ISOLATION) {
+               ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
+               ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+
+               pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+                       ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+       }
+
        if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
                ms_hyperv.nested_features =
                        cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
@@ -366,6 +422,8 @@ static void __init ms_hyperv_init_platform(void)
 
 # ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+       if (hv_root_partition)
+               smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
 # endif
 
        /*
index ba04cb3..269a691 100644 (file)
@@ -426,6 +426,9 @@ static bool __init hv_init_tsc_clocksource(void)
        if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
                return false;
 
+       if (hv_root_partition)
+               return false;
+
        hv_read_reference_counter = read_hv_clock_tsc;
        phys_addr = virt_to_phys(hv_get_tsc_page());
 
index 6fb0c76..0bd202d 100644 (file)
@@ -618,7 +618,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
                goto error_clean_ring;
 
        /* Create and init the channel open message */
-       open_info = kmalloc(sizeof(*open_info) +
+       open_info = kzalloc(sizeof(*open_info) +
                           sizeof(struct vmbus_channel_open_channel),
                           GFP_KERNEL);
        if (!open_info) {
@@ -745,7 +745,7 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
        unsigned long flags;
        int ret;
 
-       info = kmalloc(sizeof(*info) +
+       info = kzalloc(sizeof(*info) +
                       sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL);
        if (!info)
                return -ENOMEM;
index 1d44bb6..f0ed730 100644 (file)
@@ -31,101 +31,118 @@ const struct vmbus_device vmbus_devs[] = {
        { .dev_type = HV_IDE,
          HV_IDE_GUID,
          .perf_device = true,
+         .allowed_in_isolated = false,
        },
 
        /* SCSI */
        { .dev_type = HV_SCSI,
          HV_SCSI_GUID,
          .perf_device = true,
+         .allowed_in_isolated = true,
        },
 
        /* Fibre Channel */
        { .dev_type = HV_FC,
          HV_SYNTHFC_GUID,
          .perf_device = true,
+         .allowed_in_isolated = false,
        },
 
        /* Synthetic NIC */
        { .dev_type = HV_NIC,
          HV_NIC_GUID,
          .perf_device = true,
+         .allowed_in_isolated = true,
        },
 
        /* Network Direct */
        { .dev_type = HV_ND,
          HV_ND_GUID,
          .perf_device = true,
+         .allowed_in_isolated = false,
        },
 
        /* PCIE */
        { .dev_type = HV_PCIE,
          HV_PCIE_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Synthetic Frame Buffer */
        { .dev_type = HV_FB,
          HV_SYNTHVID_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Synthetic Keyboard */
        { .dev_type = HV_KBD,
          HV_KBD_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Synthetic MOUSE */
        { .dev_type = HV_MOUSE,
          HV_MOUSE_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* KVP */
        { .dev_type = HV_KVP,
          HV_KVP_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Time Synch */
        { .dev_type = HV_TS,
          HV_TS_GUID,
          .perf_device = false,
+         .allowed_in_isolated = true,
        },
 
        /* Heartbeat */
        { .dev_type = HV_HB,
          HV_HEART_BEAT_GUID,
          .perf_device = false,
+         .allowed_in_isolated = true,
        },
 
        /* Shutdown */
        { .dev_type = HV_SHUTDOWN,
          HV_SHUTDOWN_GUID,
          .perf_device = false,
+         .allowed_in_isolated = true,
        },
 
        /* File copy */
        { .dev_type = HV_FCOPY,
          HV_FCOPY_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Backup */
        { .dev_type = HV_BACKUP,
          HV_VSS_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Dynamic Memory */
        { .dev_type = HV_DM,
          HV_DM_GUID,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 
        /* Unknown GUID */
        { .dev_type = HV_UNKNOWN,
          .perf_device = false,
+         .allowed_in_isolated = false,
        },
 };
 
@@ -190,6 +207,7 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
  * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
  * @icmsghdrp: Pointer to msg header structure
  * @buf: Raw buffer channel data
+ * @buflen: Length of the raw buffer channel data.
  * @fw_version: The framework versions we can support.
  * @fw_vercnt: The size of @fw_version.
  * @srv_version: The service versions we can support.
@@ -202,8 +220,8 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
  * Set up and fill in default negotiate response message.
  * Mainly used by Hyper-V drivers.
  */
-bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
-                               u8 *buf, const int *fw_version, int fw_vercnt,
+bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
+                               u32 buflen, const int *fw_version, int fw_vercnt,
                                const int *srv_version, int srv_vercnt,
                                int *nego_fw_version, int *nego_srv_version)
 {
@@ -215,10 +233,14 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
        bool found_match = false;
        struct icmsg_negotiate *negop;
 
+       /* Check that there's enough space for icframe_vercnt, icmsg_vercnt */
+       if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) {
+               pr_err_ratelimited("Invalid icmsg negotiate\n");
+               return false;
+       }
+
        icmsghdrp->icmsgsize = 0x10;
-       negop = (struct icmsg_negotiate *)&buf[
-               sizeof(struct vmbuspipe_hdr) +
-               sizeof(struct icmsg_hdr)];
+       negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR];
 
        icframe_major = negop->icframe_vercnt;
        icframe_minor = 0;
@@ -226,6 +248,15 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
        icmsg_major = negop->icmsg_vercnt;
        icmsg_minor = 0;
 
+       /* Validate negop packet */
+       if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
+           icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
+           ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) {
+               pr_err_ratelimited("Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n",
+                                  icframe_major, icmsg_major);
+               goto fw_error;
+       }
+
        /*
         * Select the framework version number we will
         * support.
@@ -889,6 +920,20 @@ find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
        return channel;
 }
 
+static bool vmbus_is_valid_device(const guid_t *guid)
+{
+       u16 i;
+
+       if (!hv_is_isolation_supported())
+               return true;
+
+       for (i = 0; i < ARRAY_SIZE(vmbus_devs); i++) {
+               if (guid_equal(guid, &vmbus_devs[i].guid))
+                       return vmbus_devs[i].allowed_in_isolated;
+       }
+       return false;
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -903,6 +948,13 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 
        trace_vmbus_onoffer(offer);
 
+       if (!vmbus_is_valid_device(&offer->offer.if_type)) {
+               pr_err_ratelimited("Invalid offer %d from the host supporting isolation\n",
+                                  offer->child_relid);
+               atomic_dec(&vmbus_connection.offer_in_progress);
+               return;
+       }
+
        oldchannel = find_primary_channel_by_offer(offer);
 
        if (oldchannel != NULL) {
@@ -1049,6 +1101,18 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 
        mutex_lock(&vmbus_connection.channel_mutex);
        channel = relid2channel(rescind->child_relid);
+       if (channel != NULL) {
+               /*
+                * Guarantee that no other instance of vmbus_onoffer_rescind()
+                * has got a reference to the channel object.  Synchronize on
+                * &vmbus_connection.channel_mutex.
+                */
+               if (channel->rescind_ref) {
+                       mutex_unlock(&vmbus_connection.channel_mutex);
+                       return;
+               }
+               channel->rescind_ref = true;
+       }
        mutex_unlock(&vmbus_connection.channel_mutex);
 
        if (channel == NULL) {
@@ -1102,8 +1166,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
                        vmbus_device_unregister(channel->device_obj);
                        put_device(dev);
                }
-       }
-       if (channel->primary_channel != NULL) {
+       } else if (channel->primary_channel != NULL) {
                /*
                 * Sub-channel is being rescinded. Following is the channel
                 * close sequence when initiated from the driveri (refer to
index 11170d9..c83612c 100644 (file)
@@ -244,6 +244,13 @@ int vmbus_connect(void)
                        break;
        }
 
+       if (hv_is_isolation_supported() && version < VERSION_WIN10_V5_2) {
+               pr_err("Invalid VMBus version %d.%d (expected >= %d.%d) from the host supporting isolation\n",
+                      version >> 16, version & 0xFFFF, VERSION_WIN10_V5_2 >> 16, VERSION_WIN10_V5_2 & 0xFFFF);
+               ret = -EINVAL;
+               goto cleanup;
+       }
+
        vmbus_proto_version = version;
        pr_info("Vmbus version:%d.%d\n",
                version >> 16, version & 0xFFFF);
index 5040d7e..59ce85e 100644 (file)
@@ -235,15 +235,27 @@ void hv_fcopy_onchannelcallback(void *context)
        if (fcopy_transaction.state > HVUTIL_READY)
                return;
 
-       vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
-                        &requestid);
-       if (recvlen <= 0)
+       if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
+               pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n");
                return;
+       }
+
+       if (!recvlen)
+               return;
+
+       /* Ensure recvlen is big enough to read header data */
+       if (recvlen < ICMSG_HDR) {
+               pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n",
+                                  recvlen);
+               return;
+       }
 
        icmsghdr = (struct icmsg_hdr *)&recv_buffer[
                        sizeof(struct vmbuspipe_hdr)];
+
        if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-               if (vmbus_prep_negotiate_resp(icmsghdr, recv_buffer,
+               if (vmbus_prep_negotiate_resp(icmsghdr,
+                               recv_buffer, recvlen,
                                fw_versions, FW_VER_COUNT,
                                fcopy_versions, FCOPY_VER_COUNT,
                                NULL, &fcopy_srv_version)) {
@@ -252,10 +264,14 @@ void hv_fcopy_onchannelcallback(void *context)
                                fcopy_srv_version >> 16,
                                fcopy_srv_version & 0xFFFF);
                }
-       } else {
-               fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[
-                               sizeof(struct vmbuspipe_hdr) +
-                               sizeof(struct icmsg_hdr)];
+       } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) {
+               /* Ensure recvlen is big enough to contain hv_fcopy_hdr */
+               if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) {
+                       pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n",
+                                          recvlen);
+                       return;
+               }
+               fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR];
 
                /*
                 * Stash away this global state for completing the
@@ -280,6 +296,10 @@ void hv_fcopy_onchannelcallback(void *context)
                schedule_delayed_work(&fcopy_timeout_work,
                                      HV_UTIL_TIMEOUT * HZ);
                return;
+       } else {
+               pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n",
+                                  icmsghdr->icmsgtype);
+               return;
        }
        icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
        vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
index 754d35a..b49962d 100644 (file)
@@ -662,71 +662,87 @@ void hv_kvp_onchannelcallback(void *context)
        if (kvp_transaction.state > HVUTIL_READY)
                return;
 
-       vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen,
-                        &requestid);
-
-       if (recvlen > 0) {
-               icmsghdrp = (struct icmsg_hdr *)&recv_buffer[
-                       sizeof(struct vmbuspipe_hdr)];
-
-               if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-                       if (vmbus_prep_negotiate_resp(icmsghdrp,
-                                recv_buffer, fw_versions, FW_VER_COUNT,
-                                kvp_versions, KVP_VER_COUNT,
-                                NULL, &kvp_srv_version)) {
-                               pr_info("KVP IC version %d.%d\n",
-                                       kvp_srv_version >> 16,
-                                       kvp_srv_version & 0xFFFF);
-                       }
-               } else {
-                       kvp_msg = (struct hv_kvp_msg *)&recv_buffer[
-                               sizeof(struct vmbuspipe_hdr) +
-                               sizeof(struct icmsg_hdr)];
+       if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen, &requestid)) {
+               pr_err_ratelimited("KVP request received. Could not read into recv buf\n");
+               return;
+       }
 
-                       /*
-                        * Stash away this global state for completing the
-                        * transaction; note transactions are serialized.
-                        */
+       if (!recvlen)
+               return;
 
-                       kvp_transaction.recv_len = recvlen;
-                       kvp_transaction.recv_req_id = requestid;
-                       kvp_transaction.kvp_msg = kvp_msg;
+       /* Ensure recvlen is big enough to read header data */
+       if (recvlen < ICMSG_HDR) {
+               pr_err_ratelimited("KVP request received. Packet length too small: %d\n",
+                                  recvlen);
+               return;
+       }
 
-                       if (kvp_transaction.state < HVUTIL_READY) {
-                               /* Userspace is not registered yet */
-                               kvp_respond_to_host(NULL, HV_E_FAIL);
-                               return;
-                       }
-                       kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
+       icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)];
+
+       if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
+               if (vmbus_prep_negotiate_resp(icmsghdrp,
+                               recv_buffer, recvlen,
+                               fw_versions, FW_VER_COUNT,
+                               kvp_versions, KVP_VER_COUNT,
+                               NULL, &kvp_srv_version)) {
+                       pr_info("KVP IC version %d.%d\n",
+                               kvp_srv_version >> 16,
+                               kvp_srv_version & 0xFFFF);
+               }
+       } else if (icmsghdrp->icmsgtype == ICMSGTYPE_KVPEXCHANGE) {
+               /*
+                * recvlen is not checked against sizeof(struct kvp_msg) because kvp_msg contains
+                * a union of structs and the msg type received is not known. Code using this
+                * struct should provide validation when accessing its fields.
+                */
+               kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ICMSG_HDR];
 
-                       /*
-                        * Get the information from the
-                        * user-mode component.
-                        * component. This transaction will be
-                        * completed when we get the value from
-                        * the user-mode component.
-                        * Set a timeout to deal with
-                        * user-mode not responding.
-                        */
-                       schedule_work(&kvp_sendkey_work);
-                       schedule_delayed_work(&kvp_timeout_work,
-                                             HV_UTIL_TIMEOUT * HZ);
+               /*
+                * Stash away this global state for completing the
+                * transaction; note transactions are serialized.
+                */
 
-                       return;
+               kvp_transaction.recv_len = recvlen;
+               kvp_transaction.recv_req_id = requestid;
+               kvp_transaction.kvp_msg = kvp_msg;
 
+               if (kvp_transaction.state < HVUTIL_READY) {
+                       /* Userspace is not registered yet */
+                       kvp_respond_to_host(NULL, HV_E_FAIL);
+                       return;
                }
+               kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
 
-               icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
-                       | ICMSGHDRFLAG_RESPONSE;
+               /*
+                * Get the information from the
+                * user-mode component. This
+                * transaction will be
+                * completed when we get the value from
+                * the user-mode component.
+                * Set a timeout to deal with
+                * user-mode not responding.
+                */
+               schedule_work(&kvp_sendkey_work);
+               schedule_delayed_work(&kvp_timeout_work,
+                                       HV_UTIL_TIMEOUT * HZ);
 
-               vmbus_sendpacket(channel, recv_buffer,
-                                      recvlen, requestid,
-                                      VM_PKT_DATA_INBAND, 0);
+               return;
 
-               host_negotiatied = NEGO_FINISHED;
-               hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
+       } else {
+               pr_err_ratelimited("KVP request received. Invalid msg type: %d\n",
+                                  icmsghdrp->icmsgtype);
+               return;
        }
 
+       icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
+               | ICMSGHDRFLAG_RESPONSE;
+
+       vmbus_sendpacket(channel, recv_buffer,
+                        recvlen, requestid,
+                        VM_PKT_DATA_INBAND, 0);
+
+       host_negotiatied = NEGO_FINISHED;
+       hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
 }
 
 static void kvp_on_reset(void)
index 783779e..2267bd4 100644 (file)
@@ -298,49 +298,64 @@ void hv_vss_onchannelcallback(void *context)
        if (vss_transaction.state > HVUTIL_READY)
                return;
 
-       vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
-                        &requestid);
-
-       if (recvlen > 0) {
-               icmsghdrp = (struct icmsg_hdr *)&recv_buffer[
-                       sizeof(struct vmbuspipe_hdr)];
-
-               if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-                       if (vmbus_prep_negotiate_resp(icmsghdrp,
-                                recv_buffer, fw_versions, FW_VER_COUNT,
-                                vss_versions, VSS_VER_COUNT,
-                                NULL, &vss_srv_version)) {
-
-                               pr_info("VSS IC version %d.%d\n",
-                                       vss_srv_version >> 16,
-                                       vss_srv_version & 0xFFFF);
-                       }
-               } else {
-                       vss_msg = (struct hv_vss_msg *)&recv_buffer[
-                               sizeof(struct vmbuspipe_hdr) +
-                               sizeof(struct icmsg_hdr)];
-
-                       /*
-                        * Stash away this global state for completing the
-                        * transaction; note transactions are serialized.
-                        */
-
-                       vss_transaction.recv_len = recvlen;
-                       vss_transaction.recv_req_id = requestid;
-                       vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
-
-                       schedule_work(&vss_handle_request_work);
+       if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
+               pr_err_ratelimited("VSS request received. Could not read into recv buf\n");
+               return;
+       }
+
+       if (!recvlen)
+               return;
+
+       /* Ensure recvlen is big enough to read header data */
+       if (recvlen < ICMSG_HDR) {
+               pr_err_ratelimited("VSS request received. Packet length too small: %d\n",
+                                  recvlen);
+               return;
+       }
+
+       icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)];
+
+       if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
+               if (vmbus_prep_negotiate_resp(icmsghdrp,
+                               recv_buffer, recvlen,
+                               fw_versions, FW_VER_COUNT,
+                               vss_versions, VSS_VER_COUNT,
+                               NULL, &vss_srv_version)) {
+
+                       pr_info("VSS IC version %d.%d\n",
+                               vss_srv_version >> 16,
+                               vss_srv_version & 0xFFFF);
+               }
+       } else if (icmsghdrp->icmsgtype == ICMSGTYPE_VSS) {
+               /* Ensure recvlen is big enough to contain hv_vss_msg */
+               if (recvlen < ICMSG_HDR + sizeof(struct hv_vss_msg)) {
+                       pr_err_ratelimited("Invalid VSS msg. Packet length too small: %u\n",
+                                          recvlen);
                        return;
                }
+               vss_msg = (struct hv_vss_msg *)&recv_buffer[ICMSG_HDR];
+
+               /*
+                * Stash away this global state for completing the
+                * transaction; note transactions are serialized.
+                */
 
-               icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
-                       | ICMSGHDRFLAG_RESPONSE;
+               vss_transaction.recv_len = recvlen;
+               vss_transaction.recv_req_id = requestid;
+               vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
 
-               vmbus_sendpacket(channel, recv_buffer,
-                                      recvlen, requestid,
-                                      VM_PKT_DATA_INBAND, 0);
+               schedule_work(&vss_handle_request_work);
+               return;
+       } else {
+               pr_err_ratelimited("VSS request received. Invalid msg type: %d\n",
+                                  icmsghdrp->icmsgtype);
+               return;
        }
 
+       icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION |
+               ICMSGHDRFLAG_RESPONSE;
+       vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
+                        VM_PKT_DATA_INBAND, 0);
 }
 
 static void vss_on_reset(void)
index 05566ec..e4aefeb 100644 (file)
@@ -195,73 +195,88 @@ static void shutdown_onchannelcallback(void *context)
 
        struct icmsg_hdr *icmsghdrp;
 
-       vmbus_recvpacket(channel, shut_txf_buf,
-                        HV_HYP_PAGE_SIZE, &recvlen, &requestid);
+       if (vmbus_recvpacket(channel, shut_txf_buf, HV_HYP_PAGE_SIZE, &recvlen, &requestid)) {
+               pr_err_ratelimited("Shutdown request received. Could not read into shut txf buf\n");
+               return;
+       }
 
-       if (recvlen > 0) {
-               icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[
-                       sizeof(struct vmbuspipe_hdr)];
+       if (!recvlen)
+               return;
 
-               if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-                       if (vmbus_prep_negotiate_resp(icmsghdrp, shut_txf_buf,
-                                       fw_versions, FW_VER_COUNT,
-                                       sd_versions, SD_VER_COUNT,
-                                       NULL, &sd_srv_version)) {
-                               pr_info("Shutdown IC version %d.%d\n",
-                                       sd_srv_version >> 16,
-                                       sd_srv_version & 0xFFFF);
-                       }
-               } else {
-                       shutdown_msg =
-                               (struct shutdown_msg_data *)&shut_txf_buf[
-                                       sizeof(struct vmbuspipe_hdr) +
-                                       sizeof(struct icmsg_hdr)];
+       /* Ensure recvlen is big enough to read header data */
+       if (recvlen < ICMSG_HDR) {
+               pr_err_ratelimited("Shutdown request received. Packet length too small: %d\n",
+                                  recvlen);
+               return;
+       }
 
-                       /*
-                        * shutdown_msg->flags can be 0(shut down), 2(reboot),
-                        * or 4(hibernate). It may bitwise-OR 1, which means
-                        * performing the request by force. Linux always tries
-                        * to perform the request by force.
-                        */
-                       switch (shutdown_msg->flags) {
-                       case 0:
-                       case 1:
-                               icmsghdrp->status = HV_S_OK;
-                               work = &shutdown_work;
-                               pr_info("Shutdown request received -"
-                                           " graceful shutdown initiated\n");
-                               break;
-                       case 2:
-                       case 3:
-                               icmsghdrp->status = HV_S_OK;
-                               work = &restart_work;
-                               pr_info("Restart request received -"
-                                           " graceful restart initiated\n");
-                               break;
-                       case 4:
-                       case 5:
-                               pr_info("Hibernation request received\n");
-                               icmsghdrp->status = hibernation_supported ?
-                                       HV_S_OK : HV_E_FAIL;
-                               if (hibernation_supported)
-                                       work = &hibernate_context.work;
-                               break;
-                       default:
-                               icmsghdrp->status = HV_E_FAIL;
-                               pr_info("Shutdown request received -"
-                                           " Invalid request\n");
-                               break;
-                       }
+       icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[sizeof(struct vmbuspipe_hdr)];
+
+       if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
+               if (vmbus_prep_negotiate_resp(icmsghdrp,
+                               shut_txf_buf, recvlen,
+                               fw_versions, FW_VER_COUNT,
+                               sd_versions, SD_VER_COUNT,
+                               NULL, &sd_srv_version)) {
+                       pr_info("Shutdown IC version %d.%d\n",
+                               sd_srv_version >> 16,
+                               sd_srv_version & 0xFFFF);
+               }
+       } else if (icmsghdrp->icmsgtype == ICMSGTYPE_SHUTDOWN) {
+               /* Ensure recvlen is big enough to contain shutdown_msg_data struct */
+               if (recvlen < ICMSG_HDR + sizeof(struct shutdown_msg_data)) {
+                       pr_err_ratelimited("Invalid shutdown msg data. Packet length too small: %u\n",
+                                          recvlen);
+                       return;
                }
 
-               icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
-                       | ICMSGHDRFLAG_RESPONSE;
-
-               vmbus_sendpacket(channel, shut_txf_buf,
-                                      recvlen, requestid,
-                                      VM_PKT_DATA_INBAND, 0);
+               shutdown_msg = (struct shutdown_msg_data *)&shut_txf_buf[ICMSG_HDR];
+
+               /*
+                * shutdown_msg->flags can be 0(shut down), 2(reboot),
+                * or 4(hibernate). It may bitwise-OR 1, which means
+                * performing the request by force. Linux always tries
+                * to perform the request by force.
+                */
+               switch (shutdown_msg->flags) {
+               case 0:
+               case 1:
+                       icmsghdrp->status = HV_S_OK;
+                       work = &shutdown_work;
+                       pr_info("Shutdown request received - graceful shutdown initiated\n");
+                       break;
+               case 2:
+               case 3:
+                       icmsghdrp->status = HV_S_OK;
+                       work = &restart_work;
+                       pr_info("Restart request received - graceful restart initiated\n");
+                       break;
+               case 4:
+               case 5:
+                       pr_info("Hibernation request received\n");
+                       icmsghdrp->status = hibernation_supported ?
+                               HV_S_OK : HV_E_FAIL;
+                       if (hibernation_supported)
+                               work = &hibernate_context.work;
+                       break;
+               default:
+                       icmsghdrp->status = HV_E_FAIL;
+                       pr_info("Shutdown request received - Invalid request\n");
+                       break;
+               }
+       } else {
+               icmsghdrp->status = HV_E_FAIL;
+               pr_err_ratelimited("Shutdown request received. Invalid msg type: %d\n",
+                                  icmsghdrp->icmsgtype);
        }
 
+       icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
+               | ICMSGHDRFLAG_RESPONSE;
+
+       vmbus_sendpacket(channel, shut_txf_buf,
+                        recvlen, requestid,
+                        VM_PKT_DATA_INBAND, 0);
+
        if (work)
                schedule_work(work);
 }
@@ -396,19 +411,27 @@ static void timesync_onchannelcallback(void *context)
                                           HV_HYP_PAGE_SIZE, &recvlen,
                                           &requestid);
                if (ret) {
-                       pr_warn_once("TimeSync IC pkt recv failed (Err: %d)\n",
-                                    ret);
+                       pr_err_ratelimited("TimeSync IC pkt recv failed (Err: %d)\n",
+                                          ret);
                        break;
                }
 
                if (!recvlen)
                        break;
 
+               /* Ensure recvlen is big enough to read header data */
+               if (recvlen < ICMSG_HDR) {
+                       pr_err_ratelimited("Timesync request received. Packet length too small: %d\n",
+                                          recvlen);
+                       break;
+               }
+
                icmsghdrp = (struct icmsg_hdr *)&time_txf_buf[
                                sizeof(struct vmbuspipe_hdr)];
 
                if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-                       if (vmbus_prep_negotiate_resp(icmsghdrp, time_txf_buf,
+                       if (vmbus_prep_negotiate_resp(icmsghdrp,
+                                               time_txf_buf, recvlen,
                                                fw_versions, FW_VER_COUNT,
                                                ts_versions, TS_VER_COUNT,
                                                NULL, &ts_srv_version)) {
@@ -416,33 +439,44 @@ static void timesync_onchannelcallback(void *context)
                                        ts_srv_version >> 16,
                                        ts_srv_version & 0xFFFF);
                        }
-               } else {
+               } else if (icmsghdrp->icmsgtype == ICMSGTYPE_TIMESYNC) {
                        if (ts_srv_version > TS_VERSION_3) {
-                               refdata = (struct ictimesync_ref_data *)
-                                       &time_txf_buf[
-                                       sizeof(struct vmbuspipe_hdr) +
-                                       sizeof(struct icmsg_hdr)];
+                               /* Ensure recvlen is big enough to read ictimesync_ref_data */
+                               if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_ref_data)) {
+                                       pr_err_ratelimited("Invalid ictimesync ref data. Length too small: %u\n",
+                                                          recvlen);
+                                       break;
+                               }
+                               refdata = (struct ictimesync_ref_data *)&time_txf_buf[ICMSG_HDR];
 
                                adj_guesttime(refdata->parenttime,
                                                refdata->vmreferencetime,
                                                refdata->flags);
                        } else {
-                               timedatap = (struct ictimesync_data *)
-                                       &time_txf_buf[
-                                       sizeof(struct vmbuspipe_hdr) +
-                                       sizeof(struct icmsg_hdr)];
+                               /* Ensure recvlen is big enough to read ictimesync_data */
+                               if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_data)) {
+                                       pr_err_ratelimited("Invalid ictimesync data. Length too small: %u\n",
+                                                          recvlen);
+                                       break;
+                               }
+                               timedatap = (struct ictimesync_data *)&time_txf_buf[ICMSG_HDR];
+
                                adj_guesttime(timedatap->parenttime,
                                              hv_read_reference_counter(),
                                              timedatap->flags);
                        }
+               } else {
+                       icmsghdrp->status = HV_E_FAIL;
+                       pr_err_ratelimited("Timesync request received. Invalid msg type: %d\n",
+                                          icmsghdrp->icmsgtype);
                }
 
                icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
                        | ICMSGHDRFLAG_RESPONSE;
 
                vmbus_sendpacket(channel, time_txf_buf,
-                               recvlen, requestid,
-                               VM_PKT_DATA_INBAND, 0);
+                                recvlen, requestid,
+                                VM_PKT_DATA_INBAND, 0);
        }
 }
 
@@ -462,18 +496,28 @@ static void heartbeat_onchannelcallback(void *context)
 
        while (1) {
 
-               vmbus_recvpacket(channel, hbeat_txf_buf,
-                                HV_HYP_PAGE_SIZE, &recvlen, &requestid);
+               if (vmbus_recvpacket(channel, hbeat_txf_buf, HV_HYP_PAGE_SIZE,
+                                    &recvlen, &requestid)) {
+                       pr_err_ratelimited("Heartbeat request received. Could not read into hbeat txf buf\n");
+                       return;
+               }
 
                if (!recvlen)
                        break;
 
+               /* Ensure recvlen is big enough to read header data */
+               if (recvlen < ICMSG_HDR) {
+                       pr_err_ratelimited("Heartbeat request received. Packet length too small: %d\n",
+                                          recvlen);
+                       break;
+               }
+
                icmsghdrp = (struct icmsg_hdr *)&hbeat_txf_buf[
                                sizeof(struct vmbuspipe_hdr)];
 
                if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
                        if (vmbus_prep_negotiate_resp(icmsghdrp,
-                                       hbeat_txf_buf,
+                                       hbeat_txf_buf, recvlen,
                                        fw_versions, FW_VER_COUNT,
                                        hb_versions, HB_VER_COUNT,
                                        NULL, &hb_srv_version)) {
@@ -482,21 +526,31 @@ static void heartbeat_onchannelcallback(void *context)
                                        hb_srv_version >> 16,
                                        hb_srv_version & 0xFFFF);
                        }
-               } else {
-                       heartbeat_msg =
-                               (struct heartbeat_msg_data *)&hbeat_txf_buf[
-                                       sizeof(struct vmbuspipe_hdr) +
-                                       sizeof(struct icmsg_hdr)];
+               } else if (icmsghdrp->icmsgtype == ICMSGTYPE_HEARTBEAT) {
+                       /*
+                        * Ensure recvlen is big enough to read seq_num. Reserved area is not
+                        * included in the check as the host may not fill it up entirely
+                        */
+                       if (recvlen < ICMSG_HDR + sizeof(u64)) {
+                               pr_err_ratelimited("Invalid heartbeat msg data. Length too small: %u\n",
+                                                  recvlen);
+                               break;
+                       }
+                       heartbeat_msg = (struct heartbeat_msg_data *)&hbeat_txf_buf[ICMSG_HDR];
 
                        heartbeat_msg->seq_num += 1;
+               } else {
+                       icmsghdrp->status = HV_E_FAIL;
+                       pr_err_ratelimited("Heartbeat request received. Invalid msg type: %d\n",
+                                          icmsghdrp->icmsgtype);
                }
 
                icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
                        | ICMSGHDRFLAG_RESPONSE;
 
                vmbus_sendpacket(channel, hbeat_txf_buf,
-                                      recvlen, requestid,
-                                      VM_PKT_DATA_INBAND, 0);
+                                recvlen, requestid,
+                                VM_PKT_DATA_INBAND, 0);
        }
 }
 
index d491fdc..10dce9f 100644 (file)
@@ -678,6 +678,23 @@ static const struct attribute_group vmbus_dev_group = {
 };
 __ATTRIBUTE_GROUPS(vmbus_dev);
 
+/* Set up the attribute for /sys/bus/vmbus/hibernation */
/*
 * Reports 1 when the hypervisor advertises hibernation support
 * (hv_is_hibernation_supported()), 0 otherwise, so userspace tooling
 * can decide whether to offer hibernation on this VM.
 */
+static ssize_t hibernation_show(struct bus_type *bus, char *buf)
+{
+       return sprintf(buf, "%d\n", !!hv_is_hibernation_supported());
+}
+
+static BUS_ATTR_RO(hibernation);
+
+static struct attribute *vmbus_bus_attrs[] = {
+       &bus_attr_hibernation.attr,
+       NULL,
+};
+static const struct attribute_group vmbus_bus_group = {
+       .attrs = vmbus_bus_attrs,
+};
+__ATTRIBUTE_GROUPS(vmbus_bus);
+
 /*
  * vmbus_uevent - add uevent for our device
  *
@@ -1024,6 +1041,7 @@ static struct bus_type  hv_bus = {
        .uevent =               vmbus_uevent,
        .dev_groups =           vmbus_dev_groups,
        .drv_groups =           vmbus_drv_groups,
+       .bus_groups =           vmbus_bus_groups,
        .pm =                   &vmbus_pm,
 };
 
@@ -1054,12 +1072,14 @@ void vmbus_on_msg_dpc(unsigned long data)
 {
        struct hv_per_cpu_context *hv_cpu = (void *)data;
        void *page_addr = hv_cpu->synic_message_page;
-       struct hv_message *msg = (struct hv_message *)page_addr +
+       struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
                                  VMBUS_MESSAGE_SINT;
        struct vmbus_channel_message_header *hdr;
+       enum vmbus_channel_message_type msgtype;
        const struct vmbus_channel_message_table_entry *entry;
        struct onmessage_work_context *ctx;
-       u32 message_type = msg->header.message_type;
+       __u8 payload_size;
+       u32 message_type;
 
        /*
         * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
@@ -1068,45 +1088,52 @@ void vmbus_on_msg_dpc(unsigned long data)
         */
        BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));
 
+       /*
+        * Since the message is in memory shared with the host, an erroneous or
+        * malicious Hyper-V could modify the message while vmbus_on_msg_dpc()
+        * or individual message handlers are executing; to prevent this, copy
+        * the message into private memory.
+        */
+       memcpy(&msg_copy, msg, sizeof(struct hv_message));
+
+       message_type = msg_copy.header.message_type;
        if (message_type == HVMSG_NONE)
                /* no msg */
                return;
 
-       hdr = (struct vmbus_channel_message_header *)msg->u.payload;
+       hdr = (struct vmbus_channel_message_header *)msg_copy.u.payload;
+       msgtype = hdr->msgtype;
 
        trace_vmbus_on_msg_dpc(hdr);
 
-       if (hdr->msgtype >= CHANNELMSG_COUNT) {
-               WARN_ONCE(1, "unknown msgtype=%d\n", hdr->msgtype);
+       if (msgtype >= CHANNELMSG_COUNT) {
+               WARN_ONCE(1, "unknown msgtype=%d\n", msgtype);
                goto msg_handled;
        }
 
-       if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
-               WARN_ONCE(1, "payload size is too large (%d)\n",
-                         msg->header.payload_size);
+       payload_size = msg_copy.header.payload_size;
+       if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
+               WARN_ONCE(1, "payload size is too large (%d)\n", payload_size);
                goto msg_handled;
        }
 
-       entry = &channel_message_table[hdr->msgtype];
+       entry = &channel_message_table[msgtype];
 
        if (!entry->message_handler)
                goto msg_handled;
 
-       if (msg->header.payload_size < entry->min_payload_len) {
-               WARN_ONCE(1, "message too short: msgtype=%d len=%d\n",
-                         hdr->msgtype, msg->header.payload_size);
+       if (payload_size < entry->min_payload_len) {
+               WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", msgtype, payload_size);
                goto msg_handled;
        }
 
        if (entry->handler_type == VMHT_BLOCKING) {
-               ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size,
-                             GFP_ATOMIC);
+               ctx = kmalloc(sizeof(*ctx) + payload_size, GFP_ATOMIC);
                if (ctx == NULL)
                        return;
 
                INIT_WORK(&ctx->work, vmbus_onmessage_work);
-               memcpy(&ctx->msg, msg, sizeof(msg->header) +
-                      msg->header.payload_size);
+               memcpy(&ctx->msg, &msg_copy, sizeof(msg->header) + payload_size);
 
                /*
                 * The host can generate a rescind message while we
@@ -1115,7 +1142,7 @@ void vmbus_on_msg_dpc(unsigned long data)
                 * by offer_in_progress and by channel_mutex.  See also the
                 * inline comments in vmbus_onoffer_rescind().
                 */
-               switch (hdr->msgtype) {
+               switch (msgtype) {
                case CHANNELMSG_RESCIND_CHANNELOFFER:
                        /*
                         * If we are handling the rescind message;
@@ -2618,6 +2645,9 @@ static int __init hv_acpi_init(void)
        if (!hv_is_hyperv_initialized())
                return -ENODEV;
 
+       if (hv_root_partition)
+               return 0;
+
        init_completion(&probe_event);
 
        /*
index 1d21a0b..e285a22 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/io_apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
 
 #include "irq_remapping.h"
 
@@ -115,30 +116,43 @@ static const struct irq_domain_ops hyperv_ir_domain_ops = {
        .free = hyperv_irq_remapping_free,
 };
 
+static const struct irq_domain_ops hyperv_root_ir_domain_ops;
 static int __init hyperv_prepare_irq_remapping(void)
 {
        struct fwnode_handle *fn;
        int i;
+       const char *name;
+       const struct irq_domain_ops *ops;
 
        if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) ||
            x86_init.hyper.msi_ext_dest_id() ||
            !x2apic_supported())
                return -ENODEV;
 
-       fn = irq_domain_alloc_named_id_fwnode("HYPERV-IR", 0);
+       if (hv_root_partition) {
+               name = "HYPERV-ROOT-IR";
+               ops = &hyperv_root_ir_domain_ops;
+       } else {
+               name = "HYPERV-IR";
+               ops = &hyperv_ir_domain_ops;
+       }
+
+       fn = irq_domain_alloc_named_id_fwnode(name, 0);
        if (!fn)
                return -ENOMEM;
 
        ioapic_ir_domain =
                irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
-                               0, IOAPIC_REMAPPING_ENTRY, fn,
-                               &hyperv_ir_domain_ops, NULL);
+                               0, IOAPIC_REMAPPING_ENTRY, fn, ops, NULL);
 
        if (!ioapic_ir_domain) {
                irq_domain_free_fwnode(fn);
                return -ENOMEM;
        }
 
+       if (hv_root_partition)
+               return 0; /* The rest is only relevant to guests */
+
        /*
         * Hyper-V doesn't provide irq remapping function for
         * IO-APIC and so IO-APIC only accepts 8-bit APIC ID.
@@ -166,4 +180,161 @@ struct irq_remap_ops hyperv_irq_remap_ops = {
        .enable                 = hyperv_enable_irq_remapping,
 };
 
+/* IRQ remapping domain when Linux runs as the root partition */
/*
 * Per-IRQ state for the root-partition remapping domain: which IO-APIC
 * the interrupt belongs to, its trigger mode, and the hv_interrupt_entry
 * returned by the last successful map hypercall (needed to unmap it).
 */
+struct hyperv_root_ir_data {
+       u8 ioapic_id;
+       bool is_level;
+       struct hv_interrupt_entry entry;
+};
+
/*
 * Compose the MSI message for an IO-APIC interrupt when Linux is the
 * Hyper-V root partition.
 *
 * Any previously established mapping is torn down first (the hypervisor
 * owns the mapping, so a stale entry must be unmapped before remapping),
 * then hv_map_ioapic_interrupt() is asked to target the first online CPU
 * in the effective affinity mask with the vector chosen by the x86 core.
 * The IO-APIC RTE the hypervisor hands back is finally translated into
 * the MSI message fields the caller expects.
 */
+static void
+hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
+{
+       u64 status;
+       u32 vector;
+       struct irq_cfg *cfg;
+       int ioapic_id;
+       struct cpumask *affinity;
+       int cpu;
+       struct hv_interrupt_entry entry;
+       struct hyperv_root_ir_data *data = irq_data->chip_data;
+       struct IO_APIC_route_entry e;
+
+       cfg = irqd_cfg(irq_data);
+       affinity = irq_data_get_effective_affinity_mask(irq_data);
+       cpu = cpumask_first_and(affinity, cpu_online_mask);
+
+       vector = cfg->vector;
+       ioapic_id = data->ioapic_id;
+
       /* Undo a previous mapping, if one is recorded in chip_data. */
+       if (data->entry.source == HV_DEVICE_TYPE_IOAPIC
+           && data->entry.ioapic_rte.as_uint64) {
+               entry = data->entry;
+
+               status = hv_unmap_ioapic_interrupt(ioapic_id, &entry);
+
               /* Unmap failure is logged but not fatal; we remap below. */
+               if (status != HV_STATUS_SUCCESS)
+                       pr_debug("%s: unexpected unmap status %lld\n", __func__, status);
+
+               data->entry.ioapic_rte.as_uint64 = 0;
+               data->entry.source = 0; /* Invalid source */
+       }
+
+
+       status = hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu,
+                                       vector, &entry);
+
+       if (status != HV_STATUS_SUCCESS) {
+               pr_err("%s: map hypercall failed, status %lld\n", __func__, status);
+               return;
+       }
+
       /* Remember the new mapping so it can be unmapped later. */
+       data->entry = entry;
+
+       /* Turn it into an IO_APIC_route_entry, and generate MSI MSG. */
+       e.w1 = entry.ioapic_rte.low_uint32;
+       e.w2 = entry.ioapic_rte.high_uint32;
+
+       memset(msg, 0, sizeof(*msg));
+       msg->arch_data.vector = e.vector;
+       msg->arch_data.delivery_mode = e.delivery_mode;
+       msg->arch_addr_lo.dest_mode_logical = e.dest_mode_logical;
+       msg->arch_addr_lo.dmar_format = e.ir_format;
+       msg->arch_addr_lo.dmar_index_0_14 = e.ir_index_0_14;
+}
+
/*
 * Change the affinity of a root-partition remapped interrupt.
 *
 * Delegates the actual vector/CPU selection to the parent (x86 vector)
 * domain; when the parent reports more work is needed (anything other
 * than an error or IRQ_SET_MASK_OK_DONE), kick the cleanup vector so
 * the old vector on the previous CPU is released.
 */
+static int hyperv_root_ir_set_affinity(struct irq_data *data,
+               const struct cpumask *mask, bool force)
+{
+       struct irq_data *parent = data->parent_data;
+       struct irq_cfg *cfg = irqd_cfg(data);
+       int ret;
+
+       ret = parent->chip->irq_set_affinity(parent, mask, force);
+       if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+               return ret;
+
+       send_cleanup_vector(cfg);
+
+       return 0;
+}
+
+static struct irq_chip hyperv_root_ir_chip = {
+       .name                   = "HYPERV-ROOT-IR",
+       .irq_ack                = apic_ack_irq,
+       .irq_set_affinity       = hyperv_root_ir_set_affinity,
+       .irq_compose_msi_msg    = hyperv_root_ir_compose_msi_msg,
+};
+
/*
 * .alloc callback of the root-partition IRQ remapping domain.
 *
 * Only single IO-APIC interrupts are supported (nr_irqs must be 1 and
 * the allocation type X86_IRQ_ALLOC_TYPE_IOAPIC).  Allocates the
 * parent-domain resources, then attaches a zeroed hyperv_root_ir_data
 * (IO-APIC id + trigger mode; the hypervisor mapping entry is filled in
 * later by the compose_msi_msg path) as chip_data.
 *
 * Returns 0 on success or a negative errno; on failure all
 * partially-allocated parent resources are released.
 */
+static int hyperv_root_irq_remapping_alloc(struct irq_domain *domain,
+                                    unsigned int virq, unsigned int nr_irqs,
+                                    void *arg)
+{
+       struct irq_alloc_info *info = arg;
+       struct irq_data *irq_data;
+       struct hyperv_root_ir_data *data;
+       int ret = 0;
+
+       if (!info || info->type != X86_IRQ_ALLOC_TYPE_IOAPIC || nr_irqs > 1)
+               return -EINVAL;
+
+       ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+       if (ret < 0)
+               return ret;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data) {
+               irq_domain_free_irqs_common(domain, virq, nr_irqs);
+               return -ENOMEM;
+       }
+
+       irq_data = irq_domain_get_irq_data(domain, virq);
+       if (!irq_data) {
+               kfree(data);
+               irq_domain_free_irqs_common(domain, virq, nr_irqs);
+               return -EINVAL;
+       }
+
       /* Stash what compose_msi_msg needs to issue the map hypercall. */
+       data->ioapic_id = info->devid;
+       data->is_level = info->ioapic.is_level;
+
+       irq_data->chip = &hyperv_root_ir_chip;
+       irq_data->chip_data = data;
+
+       return 0;
+}
+
/*
 * .free callback of the root-partition IRQ remapping domain.
 *
 * For each IRQ being freed: if a hypervisor mapping is still recorded
 * in chip_data (source is IOAPIC and the RTE is non-zero), unmap it
 * via hv_unmap_ioapic_interrupt() before releasing the per-IRQ state,
 * then free the common parent-domain resources.
 */
+static void hyperv_root_irq_remapping_free(struct irq_domain *domain,
+                                unsigned int virq, unsigned int nr_irqs)
+{
+       struct irq_data *irq_data;
+       struct hyperv_root_ir_data *data;
+       struct hv_interrupt_entry *e;
+       int i;
+
+       for (i = 0; i < nr_irqs; i++) {
+               irq_data = irq_domain_get_irq_data(domain, virq + i);
+
+               if (irq_data && irq_data->chip_data) {
+                       data = irq_data->chip_data;
+                       e = &data->entry;
+
+                       if (e->source == HV_DEVICE_TYPE_IOAPIC
+                             && e->ioapic_rte.as_uint64)
+                               hv_unmap_ioapic_interrupt(data->ioapic_id,
+                                                       &data->entry);
+
+                       kfree(data);
+               }
+       }
+
+       irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
+       .select = hyperv_irq_remapping_select,
+       .alloc = hyperv_root_irq_remapping_alloc,
+       .free = hyperv_root_irq_remapping_free,
+};
+
 #endif
index dc3f73c..c64cc76 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/prefetch.h>
 
 #include <asm/sync_bitops.h>
+#include <asm/mshyperv.h>
 
 #include "hyperv_net.h"
 #include "netvsc_trace.h"
@@ -562,7 +563,10 @@ static int negotiate_nvsp_ver(struct hv_device *device,
        init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
 
        if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
-               init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;
+               if (hv_is_isolation_supported())
+                       netdev_info(ndev, "SR-IOV not advertised by guests on the host supporting isolation\n");
+               else
+                       init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;
 
                /* Teaming bit is needed to receive link speed updates */
                init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1;
@@ -609,6 +613,13 @@ static int netvsc_connect_vsp(struct hv_device *device,
                goto cleanup;
        }
 
+       if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) {
+               netdev_err(ndev, "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n",
+                          net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61);
+               ret = -EPROTO;
+               goto cleanup;
+       }
+
        pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version);
 
        /* Send the ndis version */
@@ -1416,7 +1427,10 @@ static void netvsc_receive_inband(struct net_device *ndev,
                break;
 
        case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
-               netvsc_send_vf(ndev, nvmsg, msglen);
+               if (hv_is_isolation_supported())
+                       netdev_err(ndev, "Ignore VF_ASSOCIATION msg from the host supporting isolation\n");
+               else
+                       netvsc_send_vf(ndev, nvmsg, msglen);
                break;
        }
 }
index 6db8d96..87aa62e 100644 (file)
@@ -1216,7 +1216,7 @@ static void hv_irq_unmask(struct irq_data *data)
        params = &hbus->retarget_msi_interrupt_params;
        memset(params, 0, sizeof(*params));
        params->partition_id = HV_PARTITION_ID_SELF;
-       params->int_entry.source = 1; /* MSI(-X) */
+       params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
        hv_set_msi_entry_from_desc(&params->int_entry.msi_entry, msi_desc);
        params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
                           (hbus->hdev->dev_instance.b[4] << 16) |
index a4c6ef8..40a91ce 100644 (file)
@@ -30,6 +30,10 @@ static inline int pxm_to_node(int pxm)
 {
        return 0;
 }
/*
 * Stub for !CONFIG_ACPI_NUMA builds: with NUMA disabled every node maps
 * to proximity domain 0, mirroring the pxm_to_node() stub above.
 */
+static inline int node_to_pxm(int node)
+{
+       return 0;
+}
 #endif                         /* CONFIG_ACPI_NUMA */
 
 #ifdef CONFIG_ACPI_HMAT
index e73a118..83448e8 100644 (file)
@@ -88,7 +88,8 @@
 #define HV_CONNECT_PORT                                BIT(7)
 #define HV_ACCESS_STATS                                BIT(8)
 #define HV_DEBUGGING                           BIT(11)
-#define HV_CPU_POWER_MANAGEMENT                        BIT(12)
+#define HV_CPU_MANAGEMENT                      BIT(12)
+#define HV_ISOLATION                           BIT(22)
 
 
 /*
@@ -141,6 +142,9 @@ struct ms_hyperv_tsc_page {
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX  0x0013
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX   0x0014
 #define HVCALL_SEND_IPI_EX                     0x0015
+#define HVCALL_GET_PARTITION_ID                        0x0046
+#define HVCALL_DEPOSIT_MEMORY                  0x0048
+#define HVCALL_CREATE_VP                       0x004e
 #define HVCALL_GET_VP_REGISTERS                        0x0050
 #define HVCALL_SET_VP_REGISTERS                        0x0051
 #define HVCALL_POST_MESSAGE                    0x005c
@@ -148,6 +152,9 @@ struct ms_hyperv_tsc_page {
 #define HVCALL_POST_DEBUG_DATA                 0x0069
 #define HVCALL_RETRIEVE_DEBUG_DATA             0x006a
 #define HVCALL_RESET_DEBUG_SESSION             0x006b
+#define HVCALL_ADD_LOGICAL_PROCESSOR           0x0076
+#define HVCALL_MAP_DEVICE_INTERRUPT            0x007c
+#define HVCALL_UNMAP_DEVICE_INTERRUPT          0x007d
 #define HVCALL_RETARGET_INTERRUPT              0x007e
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
@@ -407,19 +414,144 @@ struct hv_tlb_flush_ex {
        u64 gva_list[];
 } __packed;
 
+/* HvGetPartitionId hypercall (output only) */
+struct hv_get_partition_id {
+       u64 partition_id;
+} __packed;
+
+/* HvDepositMemory hypercall */
+struct hv_deposit_memory {
+       u64 partition_id;
+       u64 gpa_page_list[];
+} __packed;
+
+struct hv_proximity_domain_flags {
+       u32 proximity_preferred : 1;
+       u32 reserved : 30;
+       u32 proximity_info_valid : 1;
+} __packed;
+
+/* Not a union in Windows but useful for zeroing */
+union hv_proximity_domain_info {
+       struct {
+               u32 domain_id;
+               struct hv_proximity_domain_flags flags;
+       };
+       u64 as_uint64;
+} __packed;
+
+struct hv_lp_startup_status {
+       u64 hv_status;
+       u64 substatus1;
+       u64 substatus2;
+       u64 substatus3;
+       u64 substatus4;
+       u64 substatus5;
+       u64 substatus6;
+} __packed;
+
+/* HvAddLogicalProcessor hypercall */
+struct hv_add_logical_processor_in {
+       u32 lp_index;
+       u32 apic_id;
+       union hv_proximity_domain_info proximity_domain_info;
+       u64 flags;
+} __packed;
+
+struct hv_add_logical_processor_out {
+       struct hv_lp_startup_status startup_status;
+} __packed;
+
+enum HV_SUBNODE_TYPE
+{
+    HvSubnodeAny = 0,
+    HvSubnodeSocket = 1,
+    HvSubnodeAmdNode = 2,
+    HvSubnodeL3 = 3,
+    HvSubnodeCount = 4,
+    HvSubnodeInvalid = -1
+};
+
+/* HvCreateVp hypercall */
+struct hv_create_vp {
+       u64 partition_id;
+       u32 vp_index;
+       u8 padding[3];
+       u8 subnode_type;
+       u64 subnode_id;
+       union hv_proximity_domain_info proximity_domain_info;
+       u64 flags;
+} __packed;
+
+enum hv_interrupt_source {
+       HV_INTERRUPT_SOURCE_MSI = 1, /* MSI and MSI-X */
+       HV_INTERRUPT_SOURCE_IOAPIC,
+};
+
+union hv_msi_address_register {
+       u32 as_uint32;
+       struct {
+               u32 reserved1:2;
+               u32 destination_mode:1;
+               u32 redirection_hint:1;
+               u32 reserved2:8;
+               u32 destination_id:8;
+               u32 msi_base:12;
+       };
+} __packed;
+
+union hv_msi_data_register {
+       u32 as_uint32;
+       struct {
+               u32 vector:8;
+               u32 delivery_mode:3;
+               u32 reserved1:3;
+               u32 level_assert:1;
+               u32 trigger_mode:1;
+               u32 reserved2:16;
+       };
+} __packed;
+
 /* HvRetargetDeviceInterrupt hypercall */
 union hv_msi_entry {
        u64 as_uint64;
        struct {
-               u32 address;
-               u32 data;
+               union hv_msi_address_register address;
+               union hv_msi_data_register data;
        } __packed;
 };
 
+union hv_ioapic_rte {
+       u64 as_uint64;
+
+       struct {
+               u32 vector:8;
+               u32 delivery_mode:3;
+               u32 destination_mode:1;
+               u32 delivery_status:1;
+               u32 interrupt_polarity:1;
+               u32 remote_irr:1;
+               u32 trigger_mode:1;
+               u32 interrupt_mask:1;
+               u32 reserved1:15;
+
+               u32 reserved2:24;
+               u32 destination_id:8;
+       };
+
+       struct {
+               u32 low_uint32;
+               u32 high_uint32;
+       };
+} __packed;
+
 struct hv_interrupt_entry {
-       u32 source;                     /* 1 for MSI(-X) */
+       u32 source;
        u32 reserved1;
-       union hv_msi_entry msi_entry;
+       union {
+               union hv_msi_entry msi_entry;
+               union hv_ioapic_rte ioapic_rte;
+       };
 } __packed;
 
 /*
@@ -494,4 +626,117 @@ struct hv_set_vp_registers_input {
        } element[];
 } __packed;
 
+enum hv_device_type {
+       HV_DEVICE_TYPE_LOGICAL = 0,
+       HV_DEVICE_TYPE_PCI = 1,
+       HV_DEVICE_TYPE_IOAPIC = 2,
+       HV_DEVICE_TYPE_ACPI = 3,
+};
+
+typedef u16 hv_pci_rid;
+typedef u16 hv_pci_segment;
+typedef u64 hv_logical_device_id;
+union hv_pci_bdf {
+       u16 as_uint16;
+
+       struct {
+               u8 function:3;
+               u8 device:5;
+               u8 bus;
+       };
+} __packed;
+
+union hv_pci_bus_range {
+       u16 as_uint16;
+
+       struct {
+               u8 subordinate_bus;
+               u8 secondary_bus;
+       };
+} __packed;
+
+union hv_device_id {
+       u64 as_uint64;
+
+       struct {
+               u64 reserved0:62;
+               u64 device_type:2;
+       };
+
+       /* HV_DEVICE_TYPE_LOGICAL */
+       struct {
+               u64 id:62;
+               u64 device_type:2;
+       } logical;
+
+       /* HV_DEVICE_TYPE_PCI */
+       struct {
+               union {
+                       hv_pci_rid rid;
+                       union hv_pci_bdf bdf;
+               };
+
+               hv_pci_segment segment;
+               union hv_pci_bus_range shadow_bus_range;
+
+               u16 phantom_function_bits:2;
+               u16 source_shadow:1;
+
+               u16 rsvdz0:11;
+               u16 device_type:2;
+       } pci;
+
+       /* HV_DEVICE_TYPE_IOAPIC */
+       struct {
+               u8 ioapic_id;
+               u8 rsvdz0;
+               u16 rsvdz1;
+               u16 rsvdz2;
+
+               u16 rsvdz3:14;
+               u16 device_type:2;
+       } ioapic;
+
+       /* HV_DEVICE_TYPE_ACPI */
+       struct {
+               u32 input_mapping_base;
+               u32 input_mapping_count:30;
+               u32 device_type:2;
+       } acpi;
+} __packed;
+
+enum hv_interrupt_trigger_mode {
+       HV_INTERRUPT_TRIGGER_MODE_EDGE = 0,
+       HV_INTERRUPT_TRIGGER_MODE_LEVEL = 1,
+};
+
+struct hv_device_interrupt_descriptor {
+       u32 interrupt_type;
+       u32 trigger_mode;
+       u32 vector_count;
+       u32 reserved;
+       struct hv_device_interrupt_target target;
+} __packed;
+
+struct hv_input_map_device_interrupt {
+       u64 partition_id;
+       u64 device_id;
+       u64 flags;
+       struct hv_interrupt_entry logical_interrupt_entry;
+       struct hv_device_interrupt_descriptor interrupt_descriptor;
+} __packed;
+
+struct hv_output_map_device_interrupt {
+       struct hv_interrupt_entry interrupt_entry;
+} __packed;
+
+struct hv_input_unmap_device_interrupt {
+       u64 partition_id;
+       u64 device_id;
+       struct hv_interrupt_entry interrupt_entry;
+} __packed;
+
+#define HV_SOURCE_SHADOW_NONE               0x0
+#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE   0x1
+
 #endif
index c577996..dff58a3 100644 (file)
 
 struct ms_hyperv_info {
        u32 features;
+       u32 features_b;
        u32 misc_features;
        u32 hints;
        u32 nested_features;
        u32 max_vp_index;
        u32 max_lp_index;
+       u32 isolation_config_a;
+       u32 isolation_config_b;
 };
 extern struct ms_hyperv_info ms_hyperv;
 
@@ -169,6 +172,8 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
 void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
 bool hv_is_hyperv_initialized(void);
 bool hv_is_hibernation_supported(void);
+enum hv_isolation_type hv_get_isolation_type(void);
+bool hv_is_isolation_supported(void);
 void hyperv_cleanup(void);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
index 5ddb479..f1d74dc 100644 (file)
@@ -785,6 +785,7 @@ struct vmbus_device {
        u16  dev_type;
        guid_t guid;
        bool perf_device;
+       bool allowed_in_isolated;
 };
 
 struct vmbus_channel {
@@ -803,6 +804,7 @@ struct vmbus_channel {
        u8 monitor_bit;
 
        bool rescind; /* got rescind msg */
+       bool rescind_ref; /* got rescind msg, got channel reference */
        struct completion rescind_event;
 
        u32 ringbuffer_gpadlhandle;
@@ -1471,6 +1473,7 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size);
 #define ICMSGTYPE_SHUTDOWN             3
 #define ICMSGTYPE_TIMESYNC             4
 #define ICMSGTYPE_VSS                  5
+#define ICMSGTYPE_FCOPY                        7
 
 #define ICMSGHDRFLAG_TRANSACTION       1
 #define ICMSGHDRFLAG_REQUEST           2
@@ -1514,11 +1517,17 @@ struct icmsg_hdr {
        u8 reserved[2];
 } __packed;
 
+#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100
+#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr))
+#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \
+       (ICMSG_HDR + sizeof(struct icmsg_negotiate) + \
+        (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version)))
+
 struct icmsg_negotiate {
        u16 icframe_vercnt;
        u16 icmsg_vercnt;
        u32 reserved;
-       struct ic_version icversion_data[1]; /* any size array */
+       struct ic_version icversion_data[]; /* any size array */
 } __packed;
 
 struct shutdown_msg_data {
@@ -1569,7 +1578,7 @@ struct hyperv_service_callback {
 };
 
 #define MAX_SRV_VER    0x7ffffff
-extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
+extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
                                const int *fw_version, int fw_vercnt,
                                const int *srv_version, int srv_vercnt,
                                int *nego_fw_version, int *nego_srv_version);