KVM: PPC: VFIO: Add in-kernel acceleration for VFIO

author Alexey Kardashevskiy <aik@ozlabs.ru>

Wed, 22 Mar 2017 04:21:56 +0000 (15:21 +1100)

committer Paul Mackerras <paulus@ozlabs.org>

Thu, 20 Apr 2017 01:39:26 +0000 (11:39 +1000)
author Alexey Kardashevskiy <aik@ozlabs.ru>
Wed, 22 Mar 2017 04:21:56 +0000 (15:21 +1100)
committer Paul Mackerras <paulus@ozlabs.org>
Thu, 20 Apr 2017 01:39:26 +0000 (11:39 +1000)
diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt

index ef51740..528c77c 100644 (file)
--- a/Documentation/virtual/kvm/devices/vfio.txt
+++ b/Documentation/virtual/kvm/devices/vfio.txt
@@ -16,7 +16,21 @@ Groups:
  
  KVM_DEV_VFIO_GROUP attributes:
    KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
+       kvm_device_attr.addr points to an int32_t file descriptor
+       for the VFIO group.
    KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking
+       kvm_device_attr.addr points to an int32_t file descriptor
+       for the VFIO group.
+  KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table
+       allocated by sPAPR KVM.
+       kvm_device_attr.addr points to a struct:
  
-For each, kvm_device_attr.addr points to an int32_t file descriptor
-for the VFIO group.
+       struct kvm_vfio_spapr_tce {
+               __s32   groupfd;
+               __s32   tablefd;
+       };
+
+       where
+       @groupfd is a file descriptor for a VFIO group;
+       @tablefd is a file descriptor for a TCE table allocated via
+               KVM_CREATE_SPAPR_TCE.
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h

index 0f3ac09..77c6082 100644 (file)
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -188,6 +188,13 @@ struct kvmppc_pginfo {
         atomic_t refcnt;
  };
  
+struct kvmppc_spapr_tce_iommu_table {  /* one hardware table attached to a guest TCE table */
+       struct rcu_head rcu;            /* used to free the entry via call_rcu() */
+       struct list_head next;          /* link in kvmppc_spapr_tce_table.iommu_tables */
+       struct iommu_table *tbl;        /* hardware table, referenced while the entry exists */
+       struct kref kref;               /* attach count; the final put unlinks the entry */
+};
+
  struct kvmppc_spapr_tce_table {
         struct list_head list;
         struct kvm *kvm;
@@ -196,6 +203,7 @@ struct kvmppc_spapr_tce_table {
         u32 page_shift;
         u64 offset;             /* in pages */
         u64 size;               /* window size in pages */
+       struct list_head iommu_tables;
         struct page *pages[0];
  };
  
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h

index 4d079a2..5885d32 100644 (file)
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -173,6 +173,10 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
  extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
                         struct kvm_memory_slot *memslot, unsigned long porder);
  extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+               struct iommu_group *grp);
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+               struct iommu_group *grp);
  
  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                 struct kvm_create_spapr_tce_64 *args);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c

index d507d94..a160c14 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -28,6 +28,8 @@
  #include <linux/hugetlb.h>
  #include <linux/list.h>
  #include <linux/anon_inodes.h>
+#include <linux/iommu.h>
+#include <linux/file.h>
  
  #include <asm/tlbflush.h>
  #include <asm/kvm_ppc.h>
@@ -40,6 +42,7 @@
  #include <asm/udbg.h>
  #include <asm/iommu.h>
  #include <asm/tce.h>
+#include <asm/mmu_context.h>
  
  static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
  {
@@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
         return ret;
  }
  
+/* RCU callback: drop the hardware table reference and free the entry. */
+static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
+{
+       struct kvmppc_spapr_tce_iommu_table *attached;
+
+       attached = container_of(head, struct kvmppc_spapr_tce_iommu_table, rcu);
+       iommu_tce_table_put(attached->tbl);
+       kfree(attached);
+}
+
+/*
+ * kref release callback: unlink the entry from its list and hand it to
+ * call_rcu() so that kvm_spapr_tce_iommu_table_free() frees it later.
+ */
+static void kvm_spapr_tce_liobn_put(struct kref *kref)
+{
+       struct kvmppc_spapr_tce_iommu_table *attached;
+
+       attached = container_of(kref, struct kvmppc_spapr_tce_iommu_table,
+                       kref);
+       list_del_rcu(&attached->next);
+       call_rcu(&attached->rcu, kvm_spapr_tce_iommu_table_free);
+}
+
+/*
+ * Drop one reference on the first attached entry, in any guest TCE table of
+ * @kvm, whose hardware table belongs to @grp. Returns as soon as one entry
+ * has been released.
+ */
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+               struct iommu_group *grp)
+{
+       struct kvmppc_spapr_tce_table *guest_tbl;
+       struct kvmppc_spapr_tce_iommu_table *cur, *nxt;
+       struct iommu_table_group *hw_group = NULL;
+       int slot;
+
+       list_for_each_entry_rcu(guest_tbl, &kvm->arch.spapr_tce_tables, list) {
+               hw_group = iommu_group_get_iommudata(grp);
+               if (WARN_ON(!hw_group))
+                       continue;
+
+               list_for_each_entry_safe(cur, nxt, &guest_tbl->iommu_tables,
+                               next) {
+                       for (slot = 0; slot < IOMMU_TABLE_GROUP_MAX_TABLES;
+                                       ++slot) {
+                               if (hw_group->tables[slot] == cur->tbl) {
+                                       kref_put(&cur->kref,
+                                                kvm_spapr_tce_liobn_put);
+                                       return;
+                               }
+                       }
+               }
+       }
+}
+
+/*
+ * Attach to the guest TCE table behind @tablefd the hardware table of @grp
+ * whose page shift, offset and size are identical to it.
+ *
+ * Returns 0 on success (including when the table was already attached and
+ * only its reference count was raised), -EBADF for a bad @tablefd, -EINVAL
+ * if @tablefd is not a TCE table of @kvm or no hardware table matches,
+ * -EFAULT if @grp carries no table group data, -ENOTTY if the matching
+ * entry is being destroyed and -ENOMEM if the entry cannot be allocated.
+ */
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+               struct iommu_group *grp)
+{
+       struct kvmppc_spapr_tce_table *stt = NULL;
+       bool found = false;
+       struct iommu_table *tbl = NULL;
+       struct iommu_table_group *table_group;
+       long i, ret = 0;
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       struct fd f;
+
+       f = fdget(tablefd);
+       if (!f.file)
+               return -EBADF;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+               if (stt == f.file->private_data) {
+                       found = true;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       /*
+        * Do not drop the file reference here: @stt is dereferenced below
+        * and the file's release handler is what tears the table down, so
+        * @f is held until every exit path is done with @stt.
+        */
+       if (!found) {
+               ret = -EINVAL;
+               goto out_fdput;
+       }
+
+       table_group = iommu_group_get_iommudata(grp);
+       if (WARN_ON(!table_group)) {
+               ret = -EFAULT;
+               goto out_fdput;
+       }
+
+       for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+               struct iommu_table *tbltmp = table_group->tables[i];
+
+               if (!tbltmp)
+                       continue;
+               /*
+                * Make sure hardware table parameters are exactly the same;
+                * this is used in the TCE handlers where boundary checks
+                * use only the first attached table.
+                */
+               if ((tbltmp->it_page_shift == stt->page_shift) &&
+                               (tbltmp->it_offset == stt->offset) &&
+                               (tbltmp->it_size == stt->size)) {
+                       /*
+                        * Reference the table to avoid races with
+                        * add/remove DMA windows.
+                        */
+                       tbl = iommu_tce_table_get(tbltmp);
+                       break;
+               }
+       }
+       if (!tbl) {
+               ret = -EINVAL;
+               goto out_fdput;
+       }
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+               if (tbl != stit->tbl)
+                       continue;
+
+               if (!kref_get_unless_zero(&stit->kref)) {
+                       /* stit is being destroyed */
+                       iommu_tce_table_put(tbl);
+                       ret = -ENOTTY;
+               }
+               /*
+                * Otherwise the table is already known to this KVM, we just
+                * increased its KVM reference counter and can return.
+                *
+                * NOTE(review): the reference taken on @tbl above is not
+                * dropped on this path - confirm whether that is intended.
+                */
+               rcu_read_unlock();
+               goto out_fdput;
+       }
+       rcu_read_unlock();
+
+       stit = kzalloc(sizeof(*stit), GFP_KERNEL);
+       if (!stit) {
+               iommu_tce_table_put(tbl);
+               ret = -ENOMEM;
+               goto out_fdput;
+       }
+
+       stit->tbl = tbl;
+       kref_init(&stit->kref);
+
+       list_add_rcu(&stit->next, &stt->iommu_tables);
+
+out_fdput:
+       fdput(f);
+       return ret;
+}
+
  static void release_spapr_tce_table(struct rcu_head *head)
  {
         struct kvmppc_spapr_tce_table *stt = container_of(head,
@@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
  static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
  {
         struct kvmppc_spapr_tce_table *stt = filp->private_data;
+       struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
  
         list_del_rcu(&stt->list);
  
+       list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+               WARN_ON(!kref_read(&stit->kref));
+               while (1) {
+                       if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put))
+                               break;
+               }
+       }
+
         kvm_put_kvm(stt->kvm);
  
         kvmppc_account_memlimit(
@@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
         stt->offset = args->offset;
         stt->size = size;
         stt->kvm = kvm;
+       INIT_LIST_HEAD_RCU(&stt->iommu_tables);
  
         for (i = 0; i < npages; i++) {
                 stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
@@ -211,11 +355,101 @@ fail:
         return ret;
  }
  
+/*
+ * Exchange hardware TCE @entry of @tbl for a zero address with DMA_NONE
+ * direction; the result of the exchange is ignored.
+ */
+static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+       enum dma_data_direction none = DMA_NONE;
+       unsigned long zero_hpa = 0;
+
+       iommu_tce_xchg(tbl, entry, &zero_hpa, &none);
+}
+
+/*
+ * Decrement the mapped counter of the preregistered memory region that the
+ * userspace address stored for @entry belongs to, then zero that stored
+ * address. Returns H_SUCCESS, or H_TOO_HARD when the userspace-address slot
+ * or the region cannot be found.
+ */
+static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
+               struct iommu_table *tbl, unsigned long entry)
+{
+       unsigned long *ua_slot = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+       struct mm_iommu_table_group_mem_t *region;
+
+       /* it_userspace allocation might be delayed */
+       if (!ua_slot)
+               return H_TOO_HARD;
+
+       region = mm_iommu_lookup(kvm->mm, *ua_slot,
+                       1ULL << tbl->it_page_shift);
+       if (!region)
+               return H_TOO_HARD;
+
+       mm_iommu_mapped_dec(region);
+       *ua_slot = 0;
+
+       return H_SUCCESS;
+}
+
+/*
+ * Clear hardware TCE @entry of @tbl and drop the mapped counter of the
+ * region it referred to. If the counter cannot be dropped, the values
+ * returned by the first exchange are written back and the error is
+ * returned to the caller.
+ */
+static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
+               struct iommu_table *tbl, unsigned long entry)
+{
+       unsigned long hpa = 0;
+       enum dma_data_direction dir = DMA_NONE;
+       long rc;
+
+       if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
+               return H_HARDWARE;
+
+       /* The exchange reported DMA_NONE: nothing was mapped here */
+       if (dir == DMA_NONE)
+               return H_SUCCESS;
+
+       rc = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+       if (rc == H_SUCCESS)
+               return rc;
+
+       iommu_tce_xchg(tbl, entry, &hpa, &dir);
+
+       return rc;
+}
+
+/*
+ * Program hardware TCE @entry of @tbl to map userspace address @ua with
+ * direction @dir, taking a mapped reference on the preregistered region
+ * containing @ua and releasing the one held for the entry's previous
+ * mapping, if any. Returns 0 on success or an H_* error code.
+ */
+long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
+               unsigned long entry, unsigned long ua,
+               enum dma_data_direction dir)
+{
+       unsigned long *ua_slot = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+       struct mm_iommu_table_group_mem_t *mem;
+       unsigned long hpa;
+       long ret;
+
+       /* it_userspace allocation might be delayed */
+       if (!ua_slot)
+               return H_TOO_HARD;
+
+       /* This only handles v2 IOMMU type, v1 is handled via ioctl() */
+       mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+       if (!mem)
+               return H_TOO_HARD;
+
+       if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
+               return H_HARDWARE;
+
+       if (mm_iommu_mapped_inc(mem))
+               return H_CLOSED;
+
+       ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
+       if (WARN_ON_ONCE(ret)) {
+               mm_iommu_mapped_dec(mem);
+               return H_HARDWARE;
+       }
+
+       /* After the exchange @dir describes what the entry held before */
+       if (dir != DMA_NONE)
+               kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+       *ua_slot = ua;
+
+       return 0;
+}
+
  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                       unsigned long ioba, unsigned long tce)
  {
         struct kvmppc_spapr_tce_table *stt;
-       long ret;
+       long ret, idx;
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       unsigned long entry, ua = 0;
+       enum dma_data_direction dir;
  
         /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
         /*          liobn, ioba, tce); */
@@ -232,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
         if (ret != H_SUCCESS)
                 return ret;
  
-       kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+       dir = iommu_tce_direction(tce);
+       if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
+                       tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+               return H_PARAMETER;
+
+       entry = ioba >> stt->page_shift;
+
+       list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+               if (dir == DMA_NONE) {
+                       ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
+                                       stit->tbl, entry);
+               } else {
+                       idx = srcu_read_lock(&vcpu->kvm->srcu);
+                       ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl,
+                                       entry, ua, dir);
+                       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+               }
+
+               if (ret == H_SUCCESS)
+                       continue;
+
+               if (ret == H_TOO_HARD)
+                       return ret;
+
+               WARN_ON_ONCE(1);
+               kvmppc_clear_tce(stit->tbl, entry);
+       }
+
+       kvmppc_tce_put(stt, entry, tce);
  
         return H_SUCCESS;
  }
@@ -247,6 +509,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
         unsigned long entry, ua = 0;
         u64 __user *tces;
         u64 tce;
+       struct kvmppc_spapr_tce_iommu_table *stit;
  
         stt = kvmppc_find_table(vcpu->kvm, liobn);
         if (!stt)
@@ -285,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 if (ret != H_SUCCESS)
                         goto unlock_exit;
  
+               if (kvmppc_gpa_to_ua(vcpu->kvm,
+                               tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
+                               &ua, NULL))
+                       return H_PARAMETER;
+
+               list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+                       ret = kvmppc_tce_iommu_map(vcpu->kvm,
+                                       stit->tbl, entry + i, ua,
+                                       iommu_tce_direction(tce));
+
+                       if (ret == H_SUCCESS)
+                               continue;
+
+                       if (ret == H_TOO_HARD)
+                               goto unlock_exit;
+
+                       WARN_ON_ONCE(1);
+                       kvmppc_clear_tce(stit->tbl, entry);
+               }
+
                 kvmppc_tce_put(stt, entry + i, tce);
         }
  
@@ -301,6 +584,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
  {
         struct kvmppc_spapr_tce_table *stt;
         long i, ret;
+       struct kvmppc_spapr_tce_iommu_table *stit;
  
         stt = kvmppc_find_table(vcpu->kvm, liobn);
         if (!stt)
@@ -314,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
         if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
                 return H_PARAMETER;
  
+       list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+               unsigned long entry = ioba >> stit->tbl->it_page_shift;
+
+               for (i = 0; i < npages; ++i) {
+                       ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
+                                       stit->tbl, entry + i);
+
+                       if (ret == H_SUCCESS)
+                               continue;
+
+                       if (ret == H_TOO_HARD)
+                               return ret;
+
+                       WARN_ON_ONCE(1);
+                       kvmppc_clear_tce(stit->tbl, entry);
+               }
+       }
+
         for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
                 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
  
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c

index 440d3ab..eda0a8f 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -40,6 +40,31 @@
  #include <asm/iommu.h>
  #include <asm/tce.h>
  
+#ifdef CONFIG_BUG
+/*
+ * Warn-once helper used by the kvmppc_rm_* handlers in this file. The first
+ * time @condition is true it prints the stringified condition together with
+ * the function name and line through pr_err() and dumps the stack; later
+ * hits stay silent. Evaluates to unlikely(!!(condition)) either way.
+ */
+#define WARN_ON_ONCE_RM(condition)     ({                      \
+       static bool __section(.data.unlikely) __warned;         \
+       int __ret_warn_once = !!(condition);                    \
+                                                               \
+       if (unlikely(__ret_warn_once && !__warned)) {           \
+               __warned = true;                                \
+               pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n",      \
+                               __stringify(condition),         \
+                               __func__, __LINE__);            \
+               dump_stack();                                   \
+       }                                                       \
+       unlikely(__ret_warn_once);                              \
+})
+
+#else
+/* Without CONFIG_BUG only the condition is evaluated; nothing is printed. */
+#define WARN_ON_ONCE_RM(condition) ({                          \
+       int __ret_warn_on = !!(condition);                      \
+       unlikely(__ret_warn_on);                                \
+})
+
+#endif
+
  #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
  
  /*
@@ -161,11 +186,117 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
  EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
  
  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+/*
+ * Exchange hardware TCE @entry of @tbl for a zero address with DMA_NONE
+ * direction using iommu_tce_xchg_rm(); the result is ignored.
+ */
+static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+       enum dma_data_direction none = DMA_NONE;
+       unsigned long zero_hpa = 0;
+
+       iommu_tce_xchg_rm(tbl, entry, &zero_hpa, &none);
+}
+
+/*
+ * Counterpart of the mapped-counter decrement built on the *_rm helpers:
+ * the userspace-address slot is translated with vmalloc_to_phys() before
+ * it is dereferenced. Returns H_SUCCESS, H_TOO_HARD when the slot or the
+ * region is unavailable, or H_HARDWARE when the translation fails.
+ */
+static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
+               struct iommu_table *tbl, unsigned long entry)
+{
+       unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+       struct mm_iommu_table_group_mem_t *region;
+
+       /* it_userspace allocation might be delayed */
+       if (!pua)
+               return H_TOO_HARD;
+
+       pua = (void *) vmalloc_to_phys(pua);
+       if (WARN_ON_ONCE_RM(!pua))
+               return H_HARDWARE;
+
+       region = mm_iommu_lookup_rm(kvm->mm, *pua,
+                       1ULL << tbl->it_page_shift);
+       if (!region)
+               return H_TOO_HARD;
+
+       mm_iommu_mapped_dec(region);
+       *pua = 0;
+
+       return H_SUCCESS;
+}
+
+/*
+ * Clear hardware TCE @entry of @tbl with iommu_tce_xchg_rm() and drop the
+ * mapped counter of the region it referred to. If the counter cannot be
+ * dropped, the values returned by the first exchange are written back and
+ * the error is returned.
+ */
+static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm,
+               struct iommu_table *tbl, unsigned long entry)
+{
+       unsigned long old_hpa = 0;
+       enum dma_data_direction old_dir = DMA_NONE;
+       long rc;
+
+       /*
+        * real mode xchg can fail if struct page crosses
+        * a page boundary
+        */
+       if (iommu_tce_xchg_rm(tbl, entry, &old_hpa, &old_dir))
+               return H_TOO_HARD;
+
+       if (old_dir == DMA_NONE)
+               return H_SUCCESS;
+
+       rc = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
+       if (!rc)
+               return rc;
+
+       iommu_tce_xchg_rm(tbl, entry, &old_hpa, &old_dir);
+
+       return rc;
+}
+
+/*
+ * Program hardware TCE @entry of @tbl to map userspace address @ua with
+ * direction @dir using the *_rm helpers, taking a mapped reference on the
+ * region containing @ua and releasing the one held for the entry's previous
+ * mapping, if any. Returns 0 on success or an H_* error code.
+ */
+static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
+               unsigned long entry, unsigned long ua,
+               enum dma_data_direction dir)
+{
+       unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+       struct mm_iommu_table_group_mem_t *mem;
+       unsigned long hpa = 0;
+       long rc;
+
+       /* it_userspace allocation might be delayed */
+       if (!pua)
+               return H_TOO_HARD;
+
+       mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+       if (!mem)
+               return H_TOO_HARD;
+
+       if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
+               return H_HARDWARE;
+
+       pua = (void *) vmalloc_to_phys(pua);
+       if (WARN_ON_ONCE_RM(!pua))
+               return H_HARDWARE;
+
+       if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
+               return H_CLOSED;
+
+       rc = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+       if (rc) {
+               /*
+                * real mode xchg can fail if struct page crosses
+                * a page boundary
+                */
+               mm_iommu_mapped_dec(mem);
+               return H_TOO_HARD;
+       }
+
+       /* After the exchange @dir describes what the entry held before */
+       if (dir != DMA_NONE)
+               kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+       *pua = ua;
+
+       return 0;
+}
+
  long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                 unsigned long ioba, unsigned long tce)
  {
         struct kvmppc_spapr_tce_table *stt;
         long ret;
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       unsigned long entry, ua = 0;
+       enum dma_data_direction dir;
  
         /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
         /*          liobn, ioba, tce); */
@@ -182,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
         if (ret != H_SUCCESS)
                 return ret;
  
-       kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+       dir = iommu_tce_direction(tce);
+       if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
+                       tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+               return H_PARAMETER;
+
+       entry = ioba >> stt->page_shift;
+
+       list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+               if (dir == DMA_NONE)
+                       ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm,
+                                       stit->tbl, entry);
+               else
+                       ret = kvmppc_rm_tce_iommu_map(vcpu->kvm,
+                                       stit->tbl, entry, ua, dir);
+
+               if (ret == H_SUCCESS)
+                       continue;
+
+               if (ret == H_TOO_HARD)
+                       return ret;
+
+               WARN_ON_ONCE_RM(1);
+               kvmppc_rm_clear_tce(stit->tbl, entry);
+       }
+
+       kvmppc_tce_put(stt, entry, tce);
  
         return H_SUCCESS;
  }
@@ -223,6 +379,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
         unsigned long tces, entry, ua = 0;
         unsigned long *rmap = NULL;
         bool prereg = false;
+       struct kvmppc_spapr_tce_iommu_table *stit;
  
         stt = kvmppc_find_table(vcpu->kvm, liobn);
         if (!stt)
@@ -270,6 +427,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                         return H_TOO_HARD;
  
                 rmap = (void *) vmalloc_to_phys(rmap);
+               if (WARN_ON_ONCE_RM(!rmap))
+                       return H_HARDWARE;
  
                 /*
                  * Synchronize with the MMU notifier callbacks in
@@ -293,6 +452,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 if (ret != H_SUCCESS)
                         goto unlock_exit;
  
+               ua = 0;
+               if (kvmppc_gpa_to_ua(vcpu->kvm,
+                               tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
+                               &ua, NULL))
+                       return H_PARAMETER;
+
+               list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+                       ret = kvmppc_rm_tce_iommu_map(vcpu->kvm,
+                                       stit->tbl, entry + i, ua,
+                                       iommu_tce_direction(tce));
+
+                       if (ret == H_SUCCESS)
+                               continue;
+
+                       if (ret == H_TOO_HARD)
+                               goto unlock_exit;
+
+                       WARN_ON_ONCE_RM(1);
+                       kvmppc_rm_clear_tce(stit->tbl, entry);
+               }
+
                 kvmppc_tce_put(stt, entry + i, tce);
         }
  
@@ -309,6 +489,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
  {
         struct kvmppc_spapr_tce_table *stt;
         long i, ret;
+       struct kvmppc_spapr_tce_iommu_table *stit;
  
         stt = kvmppc_find_table(vcpu->kvm, liobn);
         if (!stt)
@@ -322,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
         if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
                 return H_PARAMETER;
  
+       list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+               unsigned long entry = ioba >> stit->tbl->it_page_shift;
+
+               for (i = 0; i < npages; ++i) {
+                       ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm,
+                                       stit->tbl, entry + i);
+
+                       if (ret == H_SUCCESS)
+                               continue;
+
+                       if (ret == H_TOO_HARD)
+                               return ret;
+
+                       WARN_ON_ONCE_RM(1);
+                       kvmppc_rm_clear_tce(stit->tbl, entry);
+               }
+       }
+
         for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
                 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
  
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c

index 6c72448..cf725c5 100644 (file)
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -534,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
  #ifdef CONFIG_PPC_BOOK3S_64
         case KVM_CAP_SPAPR_TCE:
         case KVM_CAP_SPAPR_TCE_64:
+               /* fallthrough */
+       case KVM_CAP_SPAPR_TCE_VFIO:
         case KVM_CAP_PPC_RTAS:
         case KVM_CAP_PPC_FIXUP_HCALL:
         case KVM_CAP_PPC_ENABLE_HCALL:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h

index 7b488ea..3c168b6 100644 (file)
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1097,6 +1097,7 @@ struct kvm_device_attr {
  #define  KVM_DEV_VFIO_GROUP                    1
  #define   KVM_DEV_VFIO_GROUP_ADD                       1
  #define   KVM_DEV_VFIO_GROUP_DEL                       2
+#define   KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE             3
  
  enum kvm_device_type {
         KVM_DEV_TYPE_FSL_MPIC_20        = 1,
@@ -1118,6 +1119,11 @@ enum kvm_device_type {
         KVM_DEV_TYPE_MAX,
  };
  
+struct kvm_vfio_spapr_tce {    /* argument of KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE */
+       __s32   groupfd;        /* file descriptor for a VFIO group */
+       __s32   tablefd;        /* fd of a TCE table from KVM_CREATE_SPAPR_TCE */
+};
+
  /*
   * ioctls for VM fds
   */
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c

index d32f239..37d9118 100644 (file)
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -20,6 +20,10 @@
  #include <linux/vfio.h>
  #include "vfio.h"
  
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+#include <asm/kvm_ppc.h>
+#endif
+
  struct kvm_vfio_group {
         struct list_head node;
         struct vfio_group *vfio_group;
@@ -89,6 +93,47 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
         return ret > 0;
  }
  
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+/*
+ * Call vfio_external_user_iommu_id() through symbol_get()/symbol_put() and
+ * return its result, or -EINVAL when the symbol cannot be obtained.
+ */
+static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group)
+{
+       int (*get_id)(struct vfio_group *);
+       int id;
+
+       get_id = symbol_get(vfio_external_user_iommu_id);
+       if (!get_id)
+               return -EINVAL;
+
+       id = get_id(vfio_group);
+       symbol_put(vfio_external_user_iommu_id);
+
+       return id;
+}
+
+/*
+ * Look up the iommu_group behind @group by its id. Returns NULL when the id
+ * cannot be obtained; the callers in this file release a non-NULL result
+ * with iommu_group_put().
+ */
+static struct iommu_group *kvm_vfio_group_get_iommu_group(
+               struct vfio_group *group)
+{
+       int id = kvm_vfio_external_user_iommu_id(group);
+
+       return id < 0 ? NULL : iommu_group_get_by_id(id);
+}
+
+/*
+ * Resolve @vfio_group to its iommu_group and release the TCE table
+ * attachment @kvm holds for it; warns once and does nothing if the
+ * iommu_group cannot be found.
+ */
+static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm,
+               struct vfio_group *vfio_group)
+{
+       struct iommu_group *grp;
+
+       grp = kvm_vfio_group_get_iommu_group(vfio_group);
+       if (!WARN_ON_ONCE(!grp)) {
+               kvm_spapr_tce_release_iommu_group(kvm, grp);
+               iommu_group_put(grp);
+       }
+}
+#endif
+
  /*
   * Groups can use the same or different IOMMU domains.  If the same then
   * adding a new group may change the coherency of groups we've previously
@@ -211,6 +256,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
  
                 mutex_unlock(&kv->lock);
  
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+               kvm_spapr_tce_release_vfio_group(dev->kvm, vfio_group);
+#endif
                 kvm_vfio_group_set_kvm(vfio_group, NULL);
  
                 kvm_vfio_group_put_external_user(vfio_group);
@@ -218,6 +266,57 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
                 kvm_vfio_update_coherency(dev);
  
                 return ret;
+
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: {
+               struct kvm_vfio_spapr_tce param;
+               struct kvm_vfio *kv = dev->private;
+               struct vfio_group *vfio_group;
+               struct kvm_vfio_group *kvg;
+               struct fd f;
+               struct iommu_group *grp;
+
+               if (copy_from_user(&param, (void __user *)arg,
+                               sizeof(struct kvm_vfio_spapr_tce)))
+                       return -EFAULT;
+
+               f = fdget(param.groupfd);
+               if (!f.file)
+                       return -EBADF;
+
+               vfio_group = kvm_vfio_group_get_external_user(f.file);
+               fdput(f);
+
+               if (IS_ERR(vfio_group))
+                       return PTR_ERR(vfio_group);
+
+               grp = kvm_vfio_group_get_iommu_group(vfio_group);
+               if (WARN_ON_ONCE(!grp)) {
+                       kvm_vfio_group_put_external_user(vfio_group);
+                       return -EIO;
+               }
+
+               ret = -ENOENT;
+
+               mutex_lock(&kv->lock);
+
+               list_for_each_entry(kvg, &kv->group_list, node) {
+                       if (kvg->vfio_group != vfio_group)
+                               continue;
+
+                       ret = kvm_spapr_tce_attach_iommu_group(dev->kvm,
+                                       param.tablefd, grp);
+                       break;
+               }
+
+               mutex_unlock(&kv->lock);
+
+               iommu_group_put(grp);
+               kvm_vfio_group_put_external_user(vfio_group);
+
+               return ret;
+       }
+#endif /* CONFIG_SPAPR_TCE_IOMMU */
         }
  
         return -ENXIO;
@@ -242,6 +341,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
                 switch (attr->attr) {
                 case KVM_DEV_VFIO_GROUP_ADD:
                 case KVM_DEV_VFIO_GROUP_DEL:
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+               case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
+#endif
                         return 0;
                 }
  
@@ -257,6 +359,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
         struct kvm_vfio_group *kvg, *tmp;
  
         list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+               kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group);
+#endif
                 kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
                 kvm_vfio_group_put_external_user(kvg->vfio_group);
                 list_del(&kvg->node);
author	Alexey Kardashevskiy <aik@ozlabs.ru>
	Wed, 22 Mar 2017 04:21:56 +0000 (15:21 +1100)
committer	Paul Mackerras <paulus@ozlabs.org>
	Thu, 20 Apr 2017 01:39:26 +0000 (11:39 +1000)
Documentation/virtual/kvm/devices/vfio.txt		patch \| blob \| history
arch/powerpc/include/asm/kvm_host.h		patch \| blob \| history
arch/powerpc/include/asm/kvm_ppc.h		patch \| blob \| history
arch/powerpc/kvm/book3s_64_vio.c		patch \| blob \| history
arch/powerpc/kvm/book3s_64_vio_hv.c		patch \| blob \| history
arch/powerpc/kvm/powerpc.c		patch \| blob \| history
include/uapi/linux/kvm.h		patch \| blob \| history
virt/kvm/vfio.c		patch \| blob \| history