drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

   1 /*
   2  * Copyright 2019 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  */
  23
  24 #include "amdgpu.h"
  25 #include "umc_v6_7.h"
  26
  27 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
  28                                     struct ras_err_data *err_data, uint64_t err_addr,
  29                                     uint32_t ch_inst, uint32_t umc_inst)
  30 {
  31         switch (adev->ip_versions[UMC_HWIP][0]) {
  32         case IP_VERSION(6, 7, 0):
  33                 umc_v6_7_convert_error_address(adev,
  34                                 err_data, err_addr, ch_inst, umc_inst);
  35                 break;
  36         default:
  37                 dev_warn(adev->dev,
  38                          "UMC address to Physical address translation is not supported\n");
  39                 return AMDGPU_RAS_FAIL;
  40         }
  41
  42         return AMDGPU_RAS_SUCCESS;
  43 }
  44
  45 int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
  46                         uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
  47 {
  48         struct ras_err_data err_data = {0, 0, 0, NULL};
  49         int ret = AMDGPU_RAS_FAIL;
  50
  51         err_data.err_addr =
  52                 kcalloc(adev->umc.max_ras_err_cnt_per_query,
  53                         sizeof(struct eeprom_table_record), GFP_KERNEL);
  54         if (!err_data.err_addr) {
  55                 dev_warn(adev->dev,
  56                         "Failed to alloc memory for umc error record in MCA notifier!\n");
  57                 return AMDGPU_RAS_FAIL;
  58         }
  59
  60         /*
  61          * Translate UMC channel address to Physical address
  62          */
  63         ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
  64                                         ch_inst, umc_inst);
  65         if (ret)
  66                 goto out;
  67
  68         if (amdgpu_bad_page_threshold != 0) {
  69                 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
  70                                                 err_data.err_addr_cnt);
  71                 amdgpu_ras_save_bad_pages(adev);
  72         }
  73
  74 out:
  75         kfree(err_data.err_addr);
  76         return ret;
  77 }
  78
  79 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  80                 void *ras_error_status,
  81                 struct amdgpu_iv_entry *entry,
  82                 bool reset)
  83 {
  84         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
  85         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
  86         int ret = 0;
  87
  88         kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
  89         ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
  90         if (ret == -EOPNOTSUPP) {
  91                 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
  92                     adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
  93                     adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
  94
  95                 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
  96                     adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
  97                     adev->umc.max_ras_err_cnt_per_query) {
  98                         err_data->err_addr =
  99                                 kcalloc(adev->umc.max_ras_err_cnt_per_query,
 100                                         sizeof(struct eeprom_table_record), GFP_KERNEL);
 101
 102                         /* still call query_ras_error_address to clear error status
 103                          * even NOMEM error is encountered
 104                          */
 105                         if(!err_data->err_addr)
 106                                 dev_warn(adev->dev, "Failed to alloc memory for "
 107                                                 "umc error address record!\n");
 108
 109                         /* umc query_ras_error_address is also responsible for clearing
 110                          * error status
 111                          */
 112                         adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
 113                 }
 114         } else if (!ret) {
 115                 if (adev->umc.ras &&
 116                     adev->umc.ras->ecc_info_query_ras_error_count)
 117                     adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
 118
 119                 if (adev->umc.ras &&
 120                     adev->umc.ras->ecc_info_query_ras_error_address &&
 121                     adev->umc.max_ras_err_cnt_per_query) {
 122                         err_data->err_addr =
 123                                 kcalloc(adev->umc.max_ras_err_cnt_per_query,
 124                                         sizeof(struct eeprom_table_record), GFP_KERNEL);
 125
 126                         /* still call query_ras_error_address to clear error status
 127                          * even NOMEM error is encountered
 128                          */
 129                         if(!err_data->err_addr)
 130                                 dev_warn(adev->dev, "Failed to alloc memory for "
 131                                                 "umc error address record!\n");
 132
 133                         /* umc query_ras_error_address is also responsible for clearing
 134                          * error status
 135                          */
 136                         adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
 137                 }
 138         }
 139
 140         /* only uncorrectable error needs gpu reset */
 141         if (err_data->ue_count) {
 142                 dev_info(adev->dev, "%ld uncorrectable hardware errors "
 143                                 "detected in UMC block\n",
 144                                 err_data->ue_count);
 145
 146                 if ((amdgpu_bad_page_threshold != 0) &&
 147                         err_data->err_addr_cnt) {
 148                         amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
 149                                                 err_data->err_addr_cnt);
 150                         amdgpu_ras_save_bad_pages(adev);
 151
 152                         amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
 153
 154                         if (con->update_channel_flag == true) {
 155                                 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
 156                                 con->update_channel_flag = false;
 157                         }
 158                 }
 159
 160                 if (reset)
 161                         amdgpu_ras_reset_gpu(adev);
 162         }
 163
 164         kfree(err_data->err_addr);
 165         return AMDGPU_RAS_SUCCESS;
 166 }
 167
 168 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 169 {
 170         int ret = AMDGPU_RAS_SUCCESS;
 171
 172         if (!amdgpu_sriov_vf(adev)) {
 173                 if (!adev->gmc.xgmi.connected_to_cpu) {
 174                         struct ras_err_data err_data = {0, 0, 0, NULL};
 175                         struct ras_common_if head = {
 176                                 .block = AMDGPU_RAS_BLOCK__UMC,
 177                         };
 178                         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 179
 180                         ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 181
 182                         if (ret == AMDGPU_RAS_SUCCESS && obj) {
 183                                 obj->err_data.ue_count += err_data.ue_count;
 184                                 obj->err_data.ce_count += err_data.ce_count;
 185                         }
 186                 } else if (reset) {
 187                         /* MCA poison handler is only responsible for GPU reset,
 188                          * let MCA notifier do page retirement.
 189                          */
 190                         kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 191                         amdgpu_ras_reset_gpu(adev);
 192                 }
 193         } else {
 194                 if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
 195                         adev->virt.ops->ras_poison_handler(adev);
 196                 else
 197                         dev_warn(adev->dev,
 198                                 "No ras_poison_handler interface in SRIOV!\n");
 199         }
 200
 201         return ret;
 202 }
 203
 204 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 205                 void *ras_error_status,
 206                 struct amdgpu_iv_entry *entry)
 207 {
 208         return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
 209 }
 210
 211 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
 212 {
 213         int r;
 214
 215         r = amdgpu_ras_block_late_init(adev, ras_block);
 216         if (r)
 217                 return r;
 218
 219         if (amdgpu_ras_is_supported(adev, ras_block->block)) {
 220                 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
 221                 if (r)
 222                         goto late_fini;
 223         }
 224
 225         /* ras init of specific umc version */
 226         if (adev->umc.ras &&
 227             adev->umc.ras->err_cnt_init)
 228                 adev->umc.ras->err_cnt_init(adev);
 229
 230         return 0;
 231
 232 late_fini:
 233         amdgpu_ras_block_late_fini(adev, ras_block);
 234         return r;
 235 }
 236
 237 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 238                 struct amdgpu_irq_src *source,
 239                 struct amdgpu_iv_entry *entry)
 240 {
 241         struct ras_common_if *ras_if = adev->umc.ras_if;
 242         struct ras_dispatch_if ih_data = {
 243                 .entry = entry,
 244         };
 245
 246         if (!ras_if)
 247                 return 0;
 248
 249         ih_data.head = *ras_if;
 250
 251         amdgpu_ras_interrupt_dispatch(adev, &ih_data);
 252         return 0;
 253 }
 254
 255 void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 256                 uint64_t err_addr,
 257                 uint64_t retired_page,
 258                 uint32_t channel_index,
 259                 uint32_t umc_inst)
 260 {
 261         struct eeprom_table_record *err_rec =
 262                 &err_data->err_addr[err_data->err_addr_cnt];
 263
 264         err_rec->address = err_addr;
 265         /* page frame address is saved */
 266         err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
 267         err_rec->ts = (uint64_t)ktime_get_real_seconds();
 268         err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
 269         err_rec->cu = 0;
 270         err_rec->mem_channel = channel_index;
 271         err_rec->mcumc_id = umc_inst;
 272
 273         err_data->err_addr_cnt++;
 274 }