drm/amdkfd: report xgmi bandwidth between direct peers to the kfd
authorJonathan Kim <jonathan.kim@amd.com>
Wed, 12 May 2021 16:30:41 +0000 (12:30 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 23 Jul 2021 14:07:59 +0000 (10:07 -0400)
Report the min/max bandwidth in megabytes to the kfd for direct
xgmi connections only.  Indirect peers will report 0 since
indirect route is unknown.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
drivers/gpu/drm/amd/amdkfd/kfd_crat.c

index f9c01bd..801403d 100644 (file)
@@ -553,6 +553,30 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s
        return  (uint8_t)ret;
 }
 
+int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)dst, *peer_adev;
+       int num_links;
+
+       if (adev->asic_type != CHIP_ALDEBARAN)
+               return 0;
+
+       if (src)
+               peer_adev = (struct amdgpu_device *)src;
+
+       /* num links returns 0 for indirect peers since indirect route is unknown. */
+       num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
+       if (num_links < 0) {
+               DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
+                       adev->gmc.xgmi.physical_node_id,
+                       peer_adev->gmc.xgmi.physical_node_id, num_links);
+               num_links = 0;
+       }
+
+       /* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
+       return (num_links * 16 * 25000)/BITS_PER_BYTE;
+}
+
 uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
index cf62f43..9b98ce2 100644 (file)
@@ -226,6 +226,7 @@ uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
 uint32_t amdgpu_amdkfd_get_asic_rev_id(struct kgd_dev *kgd);
 int amdgpu_amdkfd_get_noretry(struct kgd_dev *kgd);
 uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
+int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
index 8567d5d..258cf86 100644 (file)
@@ -486,6 +486,18 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
        return  -EINVAL;
 }
 
+int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
+               struct amdgpu_device *peer_adev)
+{
+       struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
+       int i;
+
+       for (i = 0 ; i < top->num_nodes; ++i)
+               if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
+                       return top->nodes[i].num_links;
+       return  -EINVAL;
+}
+
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 {
        struct psp_xgmi_topology_info *top_info;
index 12969c0..d2189bf 100644 (file)
@@ -59,6 +59,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
 int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
                struct amdgpu_device *peer_adev);
+int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
+               struct amdgpu_device *peer_adev);
 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
                                           uint64_t addr);
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
index c6b02ae..40ce623 100644 (file)
@@ -1989,6 +1989,13 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
                sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
                sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
                sub_type_hdr->num_hops_xgmi = 1;
+               if (adev->asic_type == CHIP_ALDEBARAN) {
+                       sub_type_hdr->minimum_bandwidth_mbs =
+                                       amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
+                                                       kdev->kgd, NULL, true);
+                       sub_type_hdr->maximum_bandwidth_mbs =
+                                       sub_type_hdr->minimum_bandwidth_mbs;
+               }
        } else {
                sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
        }
@@ -2033,6 +2040,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
        sub_type_hdr->proximity_domain_to = proximity_domain_to;
        sub_type_hdr->num_hops_xgmi =
                amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
+       sub_type_hdr->maximum_bandwidth_mbs =
+               amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, peer_kdev->kgd, false);
+       sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
+               amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, NULL, true) : 0;
+
        return 0;
 }