drm/amdgpu: Add driver infrastructure for MCA RAS
authorJohn Clements <john.clements@amd.com>
Tue, 24 Aug 2021 05:24:25 +0000 (13:24 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 24 Aug 2021 19:36:18 +0000 (15:36 -0400)
Add MCA specific IP blocks targeting RAS features

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/Makefile
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c [new file with mode: 0644]
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h [new file with mode: 0644]
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdgpu/mca_v3_0.c [new file with mode: 0644]
drivers/gpu/drm/amd/amdgpu/mca_v3_0.h [new file with mode: 0644]

index 0d814c9..8d07481 100644 (file)
@@ -58,7 +58,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
        amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
        amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
        amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_hdp.o \
-       amdgpu_eeprom.o
+       amdgpu_eeprom.o amdgpu_mca.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
@@ -189,6 +189,10 @@ amdgpu-y += \
 amdgpu-y += \
        amdgpu_reset.o
 
+# add MCA block
+amdgpu-y += \
+       mca_v3_0.o
+
 # add amdkfd interfaces
 amdgpu-y += amdgpu_amdkfd.o
 
index 0f278cc..dc3c6b3 100644 (file)
 #include "amdgpu_df.h"
 #include "amdgpu_smuio.h"
 #include "amdgpu_fdinfo.h"
+#include "amdgpu_mca.h"
 
 #define MAX_GPU_INSTANCE               16
 
@@ -1009,6 +1010,9 @@ struct amdgpu_device {
        /* df */
        struct amdgpu_df                df;
 
+       /* MCA */
+       struct amdgpu_mca               mca;
+
        struct amdgpu_ip_block          ip_blocks[AMDGPU_MAX_IP_NUM];
        uint32_t                        harvest_ip_mask;
        int                             num_ip_blocks;
index d0b8d41..c7797ea 100644 (file)
@@ -471,6 +471,27 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
                        return r;
        }
 
+       if (adev->mca.mp0.ras_funcs &&
+           adev->mca.mp0.ras_funcs->ras_late_init) {
+               r = adev->mca.mp0.ras_funcs->ras_late_init(adev);
+               if (r)
+                       return r;
+       }
+
+       if (adev->mca.mp1.ras_funcs &&
+           adev->mca.mp1.ras_funcs->ras_late_init) {
+               r = adev->mca.mp1.ras_funcs->ras_late_init(adev);
+               if (r)
+                       return r;
+       }
+
+       if (adev->mca.mpio.ras_funcs &&
+           adev->mca.mpio.ras_funcs->ras_late_init) {
+               r = adev->mca.mpio.ras_funcs->ras_late_init(adev);
+               if (r)
+                       return r;
+       }
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
new file mode 100644 (file)
index 0000000..a2d3dbb
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu_ras.h"
+#include "amdgpu.h"
+#include "amdgpu_mca.h"
+
+#include "umc/umc_6_7_0_offset.h"
+#include "umc/umc_6_7_0_sh_mask.h"
+
+/*
+ * Read the MCA status register at mc_status_addr (accessed through
+ * RREG64_PCIE at mc_status_addr * 4) and add one to *error_count when the
+ * register is valid (Val == 1) and flags a correctable error (CECC == 1).
+ */
+void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
+                                             uint64_t mc_status_addr,
+                                             unsigned long *error_count)
+{
+       uint64_t status = RREG64_PCIE(mc_status_addr * 4);
+
+       /* Nothing to report unless the status register content is valid. */
+       if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1)
+               return;
+
+       if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
+               *error_count += 1;
+}
+
+/*
+ * Read the MCA status register at mc_status_addr (accessed through
+ * RREG64_PCIE at mc_status_addr * 4) and add one to *error_count when the
+ * register is valid (Val == 1) and at least one of the Deferred, UECC, PCC,
+ * UC or TCC fields is set.
+ */
+void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
+                                               uint64_t mc_status_addr,
+                                               unsigned long *error_count)
+{
+       uint64_t status = RREG64_PCIE(mc_status_addr * 4);
+
+       /* Nothing to report unless the status register content is valid. */
+       if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1)
+               return;
+
+       /* Any single one of these flags counts as one uncorrectable error. */
+       if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
+           REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
+           REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
+           REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
+           REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)
+               *error_count += 1;
+}
+
+/*
+ * Clear the MCA status register at mc_status_addr by writing zero to it
+ * (the write goes through WREG64_PCIE at mc_status_addr * 4).
+ */
+void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
+                                 uint64_t mc_status_addr)
+{
+       WREG64_PCIE(mc_status_addr * 4, 0x0ULL);
+}
+
+/*
+ * Accumulate the error state of one MCA status register into a
+ * struct ras_err_data.
+ *
+ * ras_error_status must point to a struct ras_err_data. Its ce_count and
+ * ue_count members are incremented according to the register at
+ * mc_status_addr, after which the register is cleared.
+ */
+void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
+                                     uint64_t mc_status_addr,
+                                     void *ras_error_status)
+{
+       struct ras_err_data *err = ras_error_status;
+
+       amdgpu_mca_query_correctable_error_count(adev, mc_status_addr,
+                                                &err->ce_count);
+       amdgpu_mca_query_uncorrectable_error_count(adev, mc_status_addr,
+                                                  &err->ue_count);
+
+       /* Both counters have been updated; now zero the status register. */
+       amdgpu_mca_reset_error_count(adev, mc_status_addr);
+}
+
+/*
+ * Register one MCA RAS block with the RAS core.
+ *
+ * On first use, mca_dev->ras_if is allocated and filled in with the block id
+ * from mca_dev->ras_funcs. It is then passed to amdgpu_ras_late_init()
+ * together with the sysfs name from mca_dev->ras_funcs. No interrupt
+ * callback is supplied (ih_info.cb is NULL).
+ *
+ * mca_dev->ras_funcs must be non-NULL.
+ *
+ * Returns -ENOMEM if the allocation fails, otherwise the return value of
+ * amdgpu_ras_late_init(). mca_dev->ras_if is freed and reset to NULL when
+ * that call fails or when RAS is not supported for the block.
+ */
+int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
+                            struct amdgpu_mca_ras *mca_dev)
+{
+       int r;
+       struct ras_ih_if ih_info = {
+               .cb = NULL,
+       };
+       struct ras_fs_if fs_info = {
+               .sysfs_name = mca_dev->ras_funcs->sysfs_name,
+       };
+
+       if (!mca_dev->ras_if) {
+               /*
+                * Zero the allocation: only three members are assigned below,
+                * yet the whole structure is copied by value into ih_info and
+                * fs_info and handed to the RAS core, so no member may be
+                * left holding uninitialized heap contents.
+                */
+               mca_dev->ras_if = kzalloc(sizeof(*mca_dev->ras_if), GFP_KERNEL);
+               if (!mca_dev->ras_if)
+                       return -ENOMEM;
+               mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
+               mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+               mca_dev->ras_if->sub_block_index = 0;
+       }
+       ih_info.head = fs_info.head = *mca_dev->ras_if;
+       r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
+                                &fs_info, &ih_info);
+       /* Drop the descriptor on failure or when the block has no RAS support. */
+       if (r || !amdgpu_ras_is_supported(adev, mca_dev->ras_if->block)) {
+               kfree(mca_dev->ras_if);
+               mca_dev->ras_if = NULL;
+       }
+
+       return r;
+}
+
+/*
+ * Tear down the RAS registration of one MCA block.
+ *
+ * Does nothing when mca_dev->ras_if is NULL. Otherwise the block is passed
+ * to amdgpu_ras_late_fini() (with no interrupt callback), and the descriptor
+ * is freed and the pointer reset to NULL.
+ */
+void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
+                        struct amdgpu_mca_ras *mca_dev)
+{
+       struct ras_ih_if ih_info = { .cb = NULL };
+
+       if (mca_dev->ras_if) {
+               amdgpu_ras_late_fini(adev, mca_dev->ras_if, &ih_info);
+               kfree(mca_dev->ras_if);
+               mca_dev->ras_if = NULL;
+       }
+}
\ No newline at end of file
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
new file mode 100644 (file)
index 0000000..f860f2f
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2021  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __AMDGPU_MCA_H__
+#define __AMDGPU_MCA_H__
+
+/*
+ * RAS hooks and identity of a single MCA block.
+ *
+ * Hook pointers may be NULL; amdgpu_gmc_ras_late_init() checks ras_late_init
+ * for NULL before calling it.
+ */
+struct amdgpu_mca_ras_funcs {
+       /* Registers the block with the RAS core; returns 0 or a negative errno. */
+       int (*ras_late_init)(struct amdgpu_device *adev);
+       /* Undoes ras_late_init. */
+       void (*ras_fini)(struct amdgpu_device *adev);
+       /* Accumulates error counts into ras_error_status. */
+       void (*query_ras_error_count)(struct amdgpu_device *adev,
+                                     void *ras_error_status);
+       /* Optional error-address query. */
+       void (*query_ras_error_address)(struct amdgpu_device *adev,
+                                       void *ras_error_status);
+       /* Block id stored into ras_common_if.block by amdgpu_mca_ras_late_init(). */
+       uint32_t ras_block;
+       /* Name passed to the RAS core as ras_fs_if.sysfs_name. */
+       const char *sysfs_name;
+};
+
+/* RAS state of a single MCA block. */
+struct amdgpu_mca_ras {
+       /* Allocated by amdgpu_mca_ras_late_init(), freed by amdgpu_mca_ras_fini(); NULL when unregistered. */
+       struct ras_common_if *ras_if;
+       /* Hooks and identity of the block. */
+       const struct amdgpu_mca_ras_funcs *ras_funcs;
+};
+
+/* Version-specific MCA entry points. */
+struct amdgpu_mca_funcs {
+       /* One-time setup hook for the amdgpu_mca state embedded in adev. */
+       void (*init)(struct amdgpu_device *adev);
+};
+
+/* Per-device MCA state: the version-specific function table plus one RAS entry per block. */
+struct amdgpu_mca {
+       /* Version-specific entry points. */
+       const struct amdgpu_mca_funcs *funcs;
+       struct amdgpu_mca_ras mp0;
+       struct amdgpu_mca_ras mp1;
+       struct amdgpu_mca_ras mpio;
+};
+
+/* Adds one to *error_count if the status register reports a valid correctable error. */
+void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
+                                             uint64_t mc_status_addr,
+                                             unsigned long *error_count);
+
+/* Adds one to *error_count if the status register reports a valid uncorrectable error. */
+void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
+                                               uint64_t mc_status_addr,
+                                               unsigned long *error_count);
+
+/* Writes zero to the status register at mc_status_addr. */
+void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
+                                 uint64_t mc_status_addr);
+
+/*
+ * Updates ce_count/ue_count of the struct ras_err_data that ras_error_status
+ * points to, then clears the status register.
+ */
+void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
+                                     uint64_t mc_status_addr,
+                                     void *ras_error_status);
+
+/* Registers mca_dev with the RAS core; returns 0 or a negative errno. */
+int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
+                            struct amdgpu_mca_ras *mca_dev);
+
+/* Unregisters mca_dev from the RAS core and frees mca_dev->ras_if. */
+void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
+                        struct amdgpu_mca_ras *mca_dev);
+
+#endif
index 5b51633..eae604f 100644 (file)
@@ -49,6 +49,7 @@ enum amdgpu_ras_block {
        AMDGPU_RAS_BLOCK__MP0,
        AMDGPU_RAS_BLOCK__MP1,
        AMDGPU_RAS_BLOCK__FUSE,
+       AMDGPU_RAS_BLOCK__MPIO,
 
        AMDGPU_RAS_BLOCK__LAST
 };
@@ -420,7 +421,7 @@ struct ras_badpage {
 /* interfaces for IP */
 struct ras_fs_if {
        struct ras_common_if head;
-       char sysfs_name[32];
+       const char* sysfs_name;
        char debugfs_name[32];
 };
 
index 097230b..085fab4 100644 (file)
@@ -55,6 +55,7 @@
 #include "umc_v6_0.h"
 #include "umc_v6_7.h"
 #include "hdp_v4_0.h"
+#include "mca_v3_0.h"
 
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 
@@ -1229,6 +1230,18 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct amdgpu_device *adev)
        adev->hdp.ras_funcs = &hdp_v4_0_ras_funcs;
 }
 
+/*
+ * Select the MCA function table for this ASIC. Only Aldebaran that is not
+ * connected to the CPU over XGMI gets mca_v3_0_funcs; adev->mca.funcs is
+ * left untouched in every other case.
+ */
+static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
+{
+       if (adev->asic_type != CHIP_ALDEBARAN)
+               return;
+
+       if (adev->gmc.xgmi.connected_to_cpu)
+               return;
+
+       adev->mca.funcs = &mca_v3_0_funcs;
+}
+
 static int gmc_v9_0_early_init(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1250,6 +1263,7 @@ static int gmc_v9_0_early_init(void *handle)
        gmc_v9_0_set_mmhub_ras_funcs(adev);
        gmc_v9_0_set_gfxhub_funcs(adev);
        gmc_v9_0_set_hdp_ras_funcs(adev);
+       gmc_v9_0_set_mca_funcs(adev);
 
        adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
        adev->gmc.shared_aperture_end =
@@ -1461,6 +1475,8 @@ static int gmc_v9_0_sw_init(void *handle)
        adev->gfxhub.funcs->init(adev);
 
        adev->mmhub.funcs->init(adev);
+       if (adev->mca.funcs)
+               adev->mca.funcs->init(adev);
 
        spin_lock_init(&adev->gmc.invalidate_lock);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
new file mode 100644 (file)
index 0000000..058b657
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu_ras.h"
+#include "amdgpu.h"
+#include "amdgpu_mca.h"
+
+/*
+ * Status register locations of the MP0, MP1 and MPIO blocks. Each value is
+ * passed to amdgpu_mca_query_ras_error_count() as mc_status_addr.
+ * NOTE(review): the "smn" prefix suggests SMN register addresses - confirm
+ * against the register specification.
+ */
+#define smnMCMP0_STATUST0      0x03830408
+#define smnMCMP1_STATUST0      0x03b30408
+#define smnMCMPIO_STATUST0     0x0c930408
+
+
+/* Accumulate MP0 error counts from smnMCMP0_STATUST0 into ras_error_status. */
+static void mca_v3_0_mp0_query_ras_error_count(struct amdgpu_device *adev,
+                                              void *ras_error_status)
+{
+       amdgpu_mca_query_ras_error_count(adev, smnMCMP0_STATUST0,
+                                        ras_error_status);
+}
+
+/* Register the MP0 block with the RAS core; returns 0 or a negative errno. */
+static int mca_v3_0_mp0_ras_late_init(struct amdgpu_device *adev)
+{
+       return amdgpu_mca_ras_late_init(adev, &adev->mca.mp0);
+}
+
+/* Unregister the MP0 block from the RAS core. */
+static void mca_v3_0_mp0_ras_fini(struct amdgpu_device *adev)
+{
+       amdgpu_mca_ras_fini(adev, &adev->mca.mp0);
+}
+
+/*
+ * RAS hooks and identity of the MP0 block; installed into adev->mca.mp0 by
+ * mca_v3_0_init(). No error-address query is provided.
+ */
+const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
+       .ras_late_init = mca_v3_0_mp0_ras_late_init,
+       .ras_fini = mca_v3_0_mp0_ras_fini,
+       .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
+       .query_ras_error_address = NULL,
+       .ras_block = AMDGPU_RAS_BLOCK__MP0,
+       .sysfs_name = "mp0_err_count",
+};
+
+/* Accumulate MP1 error counts from smnMCMP1_STATUST0 into ras_error_status. */
+static void mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev,
+                                              void *ras_error_status)
+{
+       amdgpu_mca_query_ras_error_count(adev, smnMCMP1_STATUST0,
+                                        ras_error_status);
+}
+
+/* Register the MP1 block with the RAS core; returns 0 or a negative errno. */
+static int mca_v3_0_mp1_ras_late_init(struct amdgpu_device *adev)
+{
+       return amdgpu_mca_ras_late_init(adev, &adev->mca.mp1);
+}
+
+/* Unregister the MP1 block from the RAS core. */
+static void mca_v3_0_mp1_ras_fini(struct amdgpu_device *adev)
+{
+       amdgpu_mca_ras_fini(adev, &adev->mca.mp1);
+}
+
+/*
+ * RAS hooks and identity of the MP1 block; installed into adev->mca.mp1 by
+ * mca_v3_0_init(). No error-address query is provided.
+ */
+const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
+       .ras_late_init = mca_v3_0_mp1_ras_late_init,
+       .ras_fini = mca_v3_0_mp1_ras_fini,
+       .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
+       .query_ras_error_address = NULL,
+       .ras_block = AMDGPU_RAS_BLOCK__MP1,
+       .sysfs_name = "mp1_err_count",
+};
+
+/* Accumulate MPIO error counts from smnMCMPIO_STATUST0 into ras_error_status. */
+static void mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev,
+                                               void *ras_error_status)
+{
+       amdgpu_mca_query_ras_error_count(adev, smnMCMPIO_STATUST0,
+                                        ras_error_status);
+}
+
+/* Register the MPIO block with the RAS core; returns 0 or a negative errno. */
+static int mca_v3_0_mpio_ras_late_init(struct amdgpu_device *adev)
+{
+       return amdgpu_mca_ras_late_init(adev, &adev->mca.mpio);
+}
+
+/* Unregister the MPIO block from the RAS core. */
+static void mca_v3_0_mpio_ras_fini(struct amdgpu_device *adev)
+{
+       amdgpu_mca_ras_fini(adev, &adev->mca.mpio);
+}
+
+/*
+ * RAS hooks and identity of the MPIO block; installed into adev->mca.mpio by
+ * mca_v3_0_init(). No error-address query is provided.
+ */
+const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
+       .ras_late_init = mca_v3_0_mpio_ras_late_init,
+       .ras_fini = mca_v3_0_mpio_ras_fini,
+       .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
+       .query_ras_error_address = NULL,
+       .ras_block = AMDGPU_RAS_BLOCK__MPIO,
+       .sysfs_name = "mpio_err_count",
+};
+
+
+/* Point the MP0, MP1 and MPIO entries of adev->mca at their v3.0 RAS tables. */
+static void mca_v3_0_init(struct amdgpu_device *adev)
+{
+       adev->mca.mp0.ras_funcs = &mca_v3_0_mp0_ras_funcs;
+       adev->mca.mp1.ras_funcs = &mca_v3_0_mp1_ras_funcs;
+       adev->mca.mpio.ras_funcs = &mca_v3_0_mpio_ras_funcs;
+}
+
+/* MCA v3.0 function table, exported through mca_v3_0.h. */
+const struct amdgpu_mca_funcs mca_v3_0_funcs = {
+       .init = mca_v3_0_init,
+};
\ No newline at end of file
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
new file mode 100644 (file)
index 0000000..b899b86
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2021  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __MCA_V3_0_H__
+#define __MCA_V3_0_H__
+
+/* MCA v3.0 function table; its init hook installs the MP0/MP1/MPIO RAS tables. */
+extern const struct amdgpu_mca_funcs mca_v3_0_funcs;
+
+#endif