drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

   1 /*
   2  * Copyright 2019 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  */
  23
  24 #include "amdgpu.h"
  25 #include "umc_v6_7.h"
  26
  27 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
  28                                     struct ras_err_data *err_data, uint64_t err_addr,
  29                                     uint32_t ch_inst, uint32_t umc_inst)
  30 {
  31         switch (adev->ip_versions[UMC_HWIP][0]) {
  32         case IP_VERSION(6, 7, 0):
  33                 umc_v6_7_convert_error_address(adev,
  34                                 err_data, err_addr, ch_inst, umc_inst);
  35                 break;
  36         default:
  37                 dev_warn(adev->dev,
  38                          "UMC address to Physical address translation is not supported\n");
  39                 return AMDGPU_RAS_FAIL;
  40         }
  41
  42         return AMDGPU_RAS_SUCCESS;
  43 }
  44
  45 int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
  46                         uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
  47 {
  48         struct ras_err_data err_data = {0, 0, 0, NULL};
  49         int ret = AMDGPU_RAS_FAIL;
  50
  51         err_data.err_addr =
  52                 kcalloc(adev->umc.max_ras_err_cnt_per_query,
  53                         sizeof(struct eeprom_table_record), GFP_KERNEL);
  54         if (!err_data.err_addr) {
  55                 dev_warn(adev->dev,
  56                         "Failed to alloc memory for umc error record in MCA notifier!\n");
  57                 return AMDGPU_RAS_FAIL;
  58         }
  59
  60         /*
  61          * Translate UMC channel address to Physical address
  62          */
  63         ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
  64                                         ch_inst, umc_inst);
  65         if (ret)
  66                 goto out;
  67
  68         if (amdgpu_bad_page_threshold != 0) {
  69                 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
  70                                                 err_data.err_addr_cnt);
  71                 amdgpu_ras_save_bad_pages(adev);
  72         }
  73
  74 out:
  75         kfree(err_data.err_addr);
  76         return ret;
  77 }
  78
  79 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  80                 void *ras_error_status,
  81                 struct amdgpu_iv_entry *entry,
  82                 bool reset)
  83 {
  84         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
  85         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
  86         int ret = 0;
  87
  88         kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
  89         ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
  90         if (ret == -EOPNOTSUPP) {
  91                 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
  92                     adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
  93                     adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
  94
  95                 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
  96                     adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
  97                     adev->umc.max_ras_err_cnt_per_query) {
  98                         err_data->err_addr =
  99                                 kcalloc(adev->umc.max_ras_err_cnt_per_query,
 100                                         sizeof(struct eeprom_table_record), GFP_KERNEL);
 101
 102                         /* still call query_ras_error_address to clear error status
 103                          * even NOMEM error is encountered
 104                          */
 105                         if(!err_data->err_addr)
 106                                 dev_warn(adev->dev, "Failed to alloc memory for "
 107                                                 "umc error address record!\n");
 108
 109                         /* umc query_ras_error_address is also responsible for clearing
 110                          * error status
 111                          */
 112                         adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
 113                 }
 114         } else if (!ret) {
 115                 if (adev->umc.ras &&
 116                     adev->umc.ras->ecc_info_query_ras_error_count)
 117                     adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
 118
 119                 if (adev->umc.ras &&
 120                     adev->umc.ras->ecc_info_query_ras_error_address &&
 121                     adev->umc.max_ras_err_cnt_per_query) {
 122                         err_data->err_addr =
 123                                 kcalloc(adev->umc.max_ras_err_cnt_per_query,
 124                                         sizeof(struct eeprom_table_record), GFP_KERNEL);
 125
 126                         /* still call query_ras_error_address to clear error status
 127                          * even NOMEM error is encountered
 128                          */
 129                         if(!err_data->err_addr)
 130                                 dev_warn(adev->dev, "Failed to alloc memory for "
 131                                                 "umc error address record!\n");
 132
 133                         /* umc query_ras_error_address is also responsible for clearing
 134                          * error status
 135                          */
 136                         adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
 137                 }
 138         }
 139
 140         /* only uncorrectable error needs gpu reset */
 141         if (err_data->ue_count) {
 142                 dev_info(adev->dev, "%ld uncorrectable hardware errors "
 143                                 "detected in UMC block\n",
 144                                 err_data->ue_count);
 145
 146                 if ((amdgpu_bad_page_threshold != 0) &&
 147                         err_data->err_addr_cnt) {
 148                         amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
 149                                                 err_data->err_addr_cnt);
 150                         amdgpu_ras_save_bad_pages(adev);
 151
 152                         amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
 153
 154                         if (con->update_channel_flag == true) {
 155                                 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
 156                                 con->update_channel_flag = false;
 157                         }
 158                 }
 159
 160                 if (reset)
 161                         amdgpu_ras_reset_gpu(adev);
 162         }
 163
 164         kfree(err_data->err_addr);
 165         return AMDGPU_RAS_SUCCESS;
 166 }
 167
 168 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 169 {
 170         int ret = AMDGPU_RAS_SUCCESS;
 171
 172         if (!adev->gmc.xgmi.connected_to_cpu) {
 173                 struct ras_err_data err_data = {0, 0, 0, NULL};
 174                 struct ras_common_if head = {
 175                         .block = AMDGPU_RAS_BLOCK__UMC,
 176                 };
 177                 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 178
 179                 ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
 180
 181                 if (ret == AMDGPU_RAS_SUCCESS && obj) {
 182                         obj->err_data.ue_count += err_data.ue_count;
 183                         obj->err_data.ce_count += err_data.ce_count;
 184                 }
 185         } else if (reset) {
 186                 /* MCA poison handler is only responsible for GPU reset,
 187                  * let MCA notifier do page retirement.
 188                  */
 189                 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 190                 amdgpu_ras_reset_gpu(adev);
 191         }
 192
 193         return ret;
 194 }
 195
 196 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 197                 void *ras_error_status,
 198                 struct amdgpu_iv_entry *entry)
 199 {
 200         return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
 201 }
 202
 203 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
 204 {
 205         int r;
 206
 207         r = amdgpu_ras_block_late_init(adev, ras_block);
 208         if (r)
 209                 return r;
 210
 211         if (amdgpu_ras_is_supported(adev, ras_block->block)) {
 212                 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
 213                 if (r)
 214                         goto late_fini;
 215         }
 216
 217         /* ras init of specific umc version */
 218         if (adev->umc.ras &&
 219             adev->umc.ras->err_cnt_init)
 220                 adev->umc.ras->err_cnt_init(adev);
 221
 222         return 0;
 223
 224 late_fini:
 225         amdgpu_ras_block_late_fini(adev, ras_block);
 226         return r;
 227 }
 228
 229 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 230                 struct amdgpu_irq_src *source,
 231                 struct amdgpu_iv_entry *entry)
 232 {
 233         struct ras_common_if *ras_if = adev->umc.ras_if;
 234         struct ras_dispatch_if ih_data = {
 235                 .entry = entry,
 236         };
 237
 238         if (!ras_if)
 239                 return 0;
 240
 241         ih_data.head = *ras_if;
 242
 243         amdgpu_ras_interrupt_dispatch(adev, &ih_data);
 244         return 0;
 245 }
 246
 247 void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 248                 uint64_t err_addr,
 249                 uint64_t retired_page,
 250                 uint32_t channel_index,
 251                 uint32_t umc_inst)
 252 {
 253         struct eeprom_table_record *err_rec =
 254                 &err_data->err_addr[err_data->err_addr_cnt];
 255
 256         err_rec->address = err_addr;
 257         /* page frame address is saved */
 258         err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
 259         err_rec->ts = (uint64_t)ktime_get_real_seconds();
 260         err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
 261         err_rec->cu = 0;
 262         err_rec->mem_channel = channel_index;
 263         err_rec->mcumc_id = umc_inst;
 264
 265         err_data->err_addr_cnt++;
 266 }