drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

   1 /*
   2  * Copyright 2019 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  */
  23
  24 #include "amdgpu_ras_eeprom.h"
  25 #include "amdgpu.h"
  26 #include "amdgpu_ras.h"
  27 #include <linux/bits.h>
  28 #include "atom.h"
  29
  30 #define EEPROM_I2C_TARGET_ADDR_VEGA20           0xA0
  31 #define EEPROM_I2C_TARGET_ADDR_ARCTURUS         0xA8
  32 #define EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342    0xA0
  33
  34 /*
  35  * The 2 macros bellow represent the actual size in bytes that
  36  * those entities occupy in the EEPROM memory.
  37  * EEPROM_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
  38  * uses uint64 to store 6b fields such as retired_page.
  39  */
  40 #define EEPROM_TABLE_HEADER_SIZE 20
  41 #define EEPROM_TABLE_RECORD_SIZE 24
  42
  43 #define EEPROM_ADDRESS_SIZE 0x2
  44
  45 /* Table hdr is 'AMDR' */
  46 #define EEPROM_TABLE_HDR_VAL 0x414d4452
  47 #define EEPROM_TABLE_VER 0x00010000
  48
  49 /* Bad GPU tag ‘BADG’ */
  50 #define EEPROM_TABLE_HDR_BAD 0x42414447
  51
  52 /* Assume 2 Mbit size */
  53 #define EEPROM_SIZE_BYTES 256000
  54 #define EEPROM_PAGE__SIZE_BYTES 256
  55 #define EEPROM_HDR_START 0
  56 #define EEPROM_RECORD_START (EEPROM_HDR_START + EEPROM_TABLE_HEADER_SIZE)
  57 #define EEPROM_MAX_RECORD_NUM ((EEPROM_SIZE_BYTES - EEPROM_TABLE_HEADER_SIZE) / EEPROM_TABLE_RECORD_SIZE)
  58 #define EEPROM_ADDR_MSB_MASK GENMASK(17, 8)
  59
  60 #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
  61
  62 static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
  63 {
  64         if ((adev->asic_type == CHIP_VEGA20) ||
  65             (adev->asic_type == CHIP_ARCTURUS))
  66                 return true;
  67
  68         return false;
  69 }
  70
  71 static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
  72                                        uint16_t *i2c_addr)
  73 {
  74         struct atom_context *atom_ctx = adev->mode_info.atom_context;
  75
  76         if (!i2c_addr || !atom_ctx)
  77                 return false;
  78
  79         if (strnstr(atom_ctx->vbios_version,
  80                     "D342",
  81                     sizeof(atom_ctx->vbios_version)))
  82                 *i2c_addr = EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342;
  83         else
  84                 *i2c_addr = EEPROM_I2C_TARGET_ADDR_ARCTURUS;
  85
  86         return true;
  87 }
  88
  89 static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
  90                                   uint16_t *i2c_addr)
  91 {
  92         if (!i2c_addr)
  93                 return false;
  94
  95         switch (adev->asic_type) {
  96         case CHIP_VEGA20:
  97                 *i2c_addr = EEPROM_I2C_TARGET_ADDR_VEGA20;
  98                 break;
  99
 100         case CHIP_ARCTURUS:
 101                 return __get_eeprom_i2c_addr_arct(adev, i2c_addr);
 102
 103         default:
 104                 return false;
 105         }
 106
 107         return true;
 108 }
 109
 110 static void __encode_table_header_to_buff(struct amdgpu_ras_eeprom_table_header *hdr,
 111                                           unsigned char *buff)
 112 {
 113         uint32_t *pp = (uint32_t *) buff;
 114
 115         pp[0] = cpu_to_le32(hdr->header);
 116         pp[1] = cpu_to_le32(hdr->version);
 117         pp[2] = cpu_to_le32(hdr->first_rec_offset);
 118         pp[3] = cpu_to_le32(hdr->tbl_size);
 119         pp[4] = cpu_to_le32(hdr->checksum);
 120 }
 121
 122 static void __decode_table_header_from_buff(struct amdgpu_ras_eeprom_table_header *hdr,
 123                                           unsigned char *buff)
 124 {
 125         uint32_t *pp = (uint32_t *)buff;
 126
 127         hdr->header           = le32_to_cpu(pp[0]);
 128         hdr->version          = le32_to_cpu(pp[1]);
 129         hdr->first_rec_offset = le32_to_cpu(pp[2]);
 130         hdr->tbl_size         = le32_to_cpu(pp[3]);
 131         hdr->checksum         = le32_to_cpu(pp[4]);
 132 }
 133
 134 static int __update_table_header(struct amdgpu_ras_eeprom_control *control,
 135                                  unsigned char *buff)
 136 {
 137         int ret = 0;
 138         struct amdgpu_device *adev = to_amdgpu_device(control);
 139         struct i2c_msg msg = {
 140                         .addr   = 0,
 141                         .flags  = 0,
 142                         .len    = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
 143                         .buf    = buff,
 144         };
 145
 146
 147         *(uint16_t *)buff = EEPROM_HDR_START;
 148         __encode_table_header_to_buff(&control->tbl_hdr, buff + EEPROM_ADDRESS_SIZE);
 149
 150         msg.addr = control->i2c_address;
 151
 152         /* i2c may be unstable in gpu reset */
 153         down_read(&adev->reset_sem);
 154         ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
 155         up_read(&adev->reset_sem);
 156
 157         if (ret < 1)
 158                 DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret);
 159
 160         return ret;
 161 }
 162
 163 static uint32_t  __calc_hdr_byte_sum(struct amdgpu_ras_eeprom_control *control)
 164 {
 165         int i;
 166         uint32_t tbl_sum = 0;
 167
 168         /* Header checksum, skip checksum field in the calculation */
 169         for (i = 0; i < sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); i++)
 170                 tbl_sum += *(((unsigned char *)&control->tbl_hdr) + i);
 171
 172         return tbl_sum;
 173 }
 174
 175 static uint32_t  __calc_recs_byte_sum(struct eeprom_table_record *records,
 176                                       int num)
 177 {
 178         int i, j;
 179         uint32_t tbl_sum = 0;
 180
 181         /* Records checksum */
 182         for (i = 0; i < num; i++) {
 183                 struct eeprom_table_record *record = &records[i];
 184
 185                 for (j = 0; j < sizeof(*record); j++) {
 186                         tbl_sum += *(((unsigned char *)record) + j);
 187                 }
 188         }
 189
 190         return tbl_sum;
 191 }
 192
 193 static inline uint32_t  __calc_tbl_byte_sum(struct amdgpu_ras_eeprom_control *control,
 194                                   struct eeprom_table_record *records, int num)
 195 {
 196         return __calc_hdr_byte_sum(control) + __calc_recs_byte_sum(records, num);
 197 }
 198
 199 /* Checksum = 256 -((sum of all table entries) mod 256) */
 200 static void __update_tbl_checksum(struct amdgpu_ras_eeprom_control *control,
 201                                   struct eeprom_table_record *records, int num,
 202                                   uint32_t old_hdr_byte_sum)
 203 {
 204         /*
 205          * This will update the table sum with new records.
 206          *
 207          * TODO: What happens when the EEPROM table is to be wrapped around
 208          * and old records from start will get overridden.
 209          */
 210
 211         /* need to recalculate updated header byte sum */
 212         control->tbl_byte_sum -= old_hdr_byte_sum;
 213         control->tbl_byte_sum += __calc_tbl_byte_sum(control, records, num);
 214
 215         control->tbl_hdr.checksum = 256 - (control->tbl_byte_sum % 256);
 216 }
 217
 218 /* table sum mod 256 + checksum must equals 256 */
 219 static bool __validate_tbl_checksum(struct amdgpu_ras_eeprom_control *control,
 220                             struct eeprom_table_record *records, int num)
 221 {
 222         control->tbl_byte_sum = __calc_tbl_byte_sum(control, records, num);
 223
 224         if (control->tbl_hdr.checksum + (control->tbl_byte_sum % 256) != 256) {
 225                 DRM_WARN("Checksum mismatch, checksum: %u ", control->tbl_hdr.checksum);
 226                 return false;
 227         }
 228
 229         return true;
 230 }
 231
 232 static int amdgpu_ras_eeprom_correct_header_tag(
 233                                 struct amdgpu_ras_eeprom_control *control,
 234                                 uint32_t header)
 235 {
 236         unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE];
 237         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 238         int ret = 0;
 239
 240         memset(buff, 0, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE);
 241
 242         mutex_lock(&control->tbl_mutex);
 243         hdr->header = header;
 244         ret = __update_table_header(control, buff);
 245         mutex_unlock(&control->tbl_mutex);
 246
 247         return ret;
 248 }
 249
 250 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 251 {
 252         unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
 253         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 254         int ret = 0;
 255
 256         mutex_lock(&control->tbl_mutex);
 257
 258         hdr->header = EEPROM_TABLE_HDR_VAL;
 259         hdr->version = EEPROM_TABLE_VER;
 260         hdr->first_rec_offset = EEPROM_RECORD_START;
 261         hdr->tbl_size = EEPROM_TABLE_HEADER_SIZE;
 262
 263         control->tbl_byte_sum = 0;
 264         __update_tbl_checksum(control, NULL, 0, 0);
 265         control->next_addr = EEPROM_RECORD_START;
 266
 267         ret = __update_table_header(control, buff);
 268
 269         mutex_unlock(&control->tbl_mutex);
 270
 271         return ret;
 272
 273 }
 274
 275 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 276                         bool *exceed_err_limit)
 277 {
 278         int ret = 0;
 279         struct amdgpu_device *adev = to_amdgpu_device(control);
 280         unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
 281         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 282         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 283         struct i2c_msg msg = {
 284                         .addr   = 0,
 285                         .flags  = I2C_M_RD,
 286                         .len    = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
 287                         .buf    = buff,
 288         };
 289
 290         *exceed_err_limit = false;
 291
 292         if (!__is_ras_eeprom_supported(adev))
 293                 return 0;
 294
 295         /* Verify i2c adapter is initialized */
 296         if (!adev->pm.smu_i2c.algo)
 297                 return -ENOENT;
 298
 299         if (!__get_eeprom_i2c_addr(adev, &control->i2c_address))
 300                 return -EINVAL;
 301
 302         mutex_init(&control->tbl_mutex);
 303
 304         msg.addr = control->i2c_address;
 305         /* Read/Create table header from EEPROM address 0 */
 306         ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
 307         if (ret < 1) {
 308                 DRM_ERROR("Failed to read EEPROM table header, ret:%d", ret);
 309                 return ret;
 310         }
 311
 312         __decode_table_header_from_buff(hdr, &buff[2]);
 313
 314         if (hdr->header == EEPROM_TABLE_HDR_VAL) {
 315                 control->num_recs = (hdr->tbl_size - EEPROM_TABLE_HEADER_SIZE) /
 316                                     EEPROM_TABLE_RECORD_SIZE;
 317                 control->tbl_byte_sum = __calc_hdr_byte_sum(control);
 318                 control->next_addr = EEPROM_RECORD_START;
 319
 320                 DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
 321                                  control->num_recs);
 322
 323         } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
 324                         (amdgpu_bad_page_threshold != 0)) {
 325                 if (ras->bad_page_cnt_threshold > control->num_recs) {
 326                         dev_info(adev->dev, "Using one valid bigger bad page "
 327                                 "threshold and correcting eeprom header tag.\n");
 328                         ret = amdgpu_ras_eeprom_correct_header_tag(control,
 329                                                         EEPROM_TABLE_HDR_VAL);
 330                 } else {
 331                         *exceed_err_limit = true;
 332                         dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, "
 333                                 "disabling the GPU.\n");
 334                 }
 335         } else {
 336                 DRM_INFO("Creating new EEPROM table");
 337
 338                 ret = amdgpu_ras_eeprom_reset_table(control);
 339         }
 340
 341         return ret == 1 ? 0 : -EIO;
 342 }
 343
 344 static void __encode_table_record_to_buff(struct amdgpu_ras_eeprom_control *control,
 345                                           struct eeprom_table_record *record,
 346                                           unsigned char *buff)
 347 {
 348         __le64 tmp = 0;
 349         int i = 0;
 350
 351         /* Next are all record fields according to EEPROM page spec in LE foramt */
 352         buff[i++] = record->err_type;
 353
 354         buff[i++] = record->bank;
 355
 356         tmp = cpu_to_le64(record->ts);
 357         memcpy(buff + i, &tmp, 8);
 358         i += 8;
 359
 360         tmp = cpu_to_le64((record->offset & 0xffffffffffff));
 361         memcpy(buff + i, &tmp, 6);
 362         i += 6;
 363
 364         buff[i++] = record->mem_channel;
 365         buff[i++] = record->mcumc_id;
 366
 367         tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
 368         memcpy(buff + i, &tmp, 6);
 369 }
 370
 371 static void __decode_table_record_from_buff(struct amdgpu_ras_eeprom_control *control,
 372                                             struct eeprom_table_record *record,
 373                                             unsigned char *buff)
 374 {
 375         __le64 tmp = 0;
 376         int i =  0;
 377
 378         /* Next are all record fields according to EEPROM page spec in LE foramt */
 379         record->err_type = buff[i++];
 380
 381         record->bank = buff[i++];
 382
 383         memcpy(&tmp, buff + i, 8);
 384         record->ts = le64_to_cpu(tmp);
 385         i += 8;
 386
 387         memcpy(&tmp, buff + i, 6);
 388         record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
 389         i += 6;
 390
 391         record->mem_channel = buff[i++];
 392         record->mcumc_id = buff[i++];
 393
 394         memcpy(&tmp, buff + i,  6);
 395         record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
 396 }
 397
 398 /*
 399  * When reaching end of EEPROM memory jump back to 0 record address
 400  * When next record access will go beyond EEPROM page boundary modify bits A17/A8
 401  * in I2C selector to go to next page
 402  */
 403 static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
 404 {
 405         uint32_t next_address = curr_address + EEPROM_TABLE_RECORD_SIZE;
 406
 407         /* When all EEPROM memory used jump back to 0 address */
 408         if (next_address > EEPROM_SIZE_BYTES) {
 409                 DRM_INFO("Reached end of EEPROM memory, jumping to 0 "
 410                          "and overriding old record");
 411                 return EEPROM_RECORD_START;
 412         }
 413
 414         /*
 415          * To check if we overflow page boundary  compare next address with
 416          * current and see if bits 17/8 of the EEPROM address will change
 417          * If they do start from the next 256b page
 418          *
 419          * https://www.st.com/resource/en/datasheet/m24m02-dr.pdf sec. 5.1.2
 420          */
 421         if ((curr_address & EEPROM_ADDR_MSB_MASK) != (next_address & EEPROM_ADDR_MSB_MASK)) {
 422                 DRM_DEBUG_DRIVER("Reached end of EEPROM memory page, jumping to next: %lx",
 423                                 (next_address & EEPROM_ADDR_MSB_MASK));
 424
 425                 return  (next_address & EEPROM_ADDR_MSB_MASK);
 426         }
 427
 428         return curr_address;
 429 }
 430
 431 int amdgpu_ras_eeprom_check_err_threshold(
 432                                 struct amdgpu_ras_eeprom_control *control,
 433                                 bool *exceed_err_limit)
 434 {
 435         struct amdgpu_device *adev = to_amdgpu_device(control);
 436         unsigned char buff[EEPROM_ADDRESS_SIZE +
 437                         EEPROM_TABLE_HEADER_SIZE] = { 0 };
 438         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 439         struct i2c_msg msg = {
 440                         .addr = control->i2c_address,
 441                         .flags = I2C_M_RD,
 442                         .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
 443                         .buf = buff,
 444         };
 445         int ret;
 446
 447         *exceed_err_limit = false;
 448
 449         if (!__is_ras_eeprom_supported(adev))
 450                 return 0;
 451
 452         /* read EEPROM table header */
 453         mutex_lock(&control->tbl_mutex);
 454         ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
 455         if (ret < 1) {
 456                 dev_err(adev->dev, "Failed to read EEPROM table header.\n");
 457                 goto err;
 458         }
 459
 460         __decode_table_header_from_buff(hdr, &buff[2]);
 461
 462         if (hdr->header == EEPROM_TABLE_HDR_BAD) {
 463                 dev_warn(adev->dev, "This GPU is in BAD status.");
 464                 dev_warn(adev->dev, "Please retire it or setting one bigger "
 465                                 "threshold value when reloading driver.\n");
 466                 *exceed_err_limit = true;
 467         }
 468
 469 err:
 470         mutex_unlock(&control->tbl_mutex);
 471         return 0;
 472 }
 473
 474 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 475                                             struct eeprom_table_record *records,
 476                                             bool write,
 477                                             int num)
 478 {
 479         int i, ret = 0;
 480         struct i2c_msg *msgs, *msg;
 481         unsigned char *buffs, *buff;
 482         struct eeprom_table_record *record;
 483         struct amdgpu_device *adev = to_amdgpu_device(control);
 484         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 485
 486         if (!__is_ras_eeprom_supported(adev))
 487                 return 0;
 488
 489         buffs = kcalloc(num, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE,
 490                          GFP_KERNEL);
 491         if (!buffs)
 492                 return -ENOMEM;
 493
 494         mutex_lock(&control->tbl_mutex);
 495
 496         msgs = kcalloc(num, sizeof(*msgs), GFP_KERNEL);
 497         if (!msgs) {
 498                 ret = -ENOMEM;
 499                 goto free_buff;
 500         }
 501
 502         /*
 503          * If saved bad pages number exceeds the bad page threshold for
 504          * the whole VRAM, update table header to mark the BAD GPU tag
 505          * and schedule one ras recovery after eeprom write is done,
 506          * this can avoid the missing for latest records.
 507          *
 508          * This new header will be picked up and checked in the bootup
 509          * by ras recovery, which may break bootup process to notify
 510          * user this GPU is in bad state and to retire such GPU for
 511          * further check.
 512          */
 513         if (write && (amdgpu_bad_page_threshold != 0) &&
 514                 ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
 515                 dev_warn(adev->dev,
 516                         "Saved bad pages(%d) reaches threshold value(%d).\n",
 517                         control->num_recs + num, ras->bad_page_cnt_threshold);
 518                 control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
 519         }
 520
 521         /* In case of overflow just start from beginning to not lose newest records */
 522         if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
 523                 control->next_addr = EEPROM_RECORD_START;
 524
 525         /*
 526          * TODO Currently makes EEPROM writes for each record, this creates
 527          * internal fragmentation. Optimized the code to do full page write of
 528          * 256b
 529          */
 530         for (i = 0; i < num; i++) {
 531                 buff = &buffs[i * (EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)];
 532                 record = &records[i];
 533                 msg = &msgs[i];
 534
 535                 control->next_addr = __correct_eeprom_dest_address(control->next_addr);
 536
 537                 /*
 538                  * Update bits 16,17 of EEPROM address in I2C address by setting them
 539                  * to bits 1,2 of Device address byte
 540                  */
 541                 msg->addr = control->i2c_address |
 542                                 ((control->next_addr & EEPROM_ADDR_MSB_MASK) >> 15);
 543                 msg->flags      = write ? 0 : I2C_M_RD;
 544                 msg->len        = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE;
 545                 msg->buf        = buff;
 546
 547                 /* Insert the EEPROM dest addess, bits 0-15 */
 548                 buff[0] = ((control->next_addr >> 8) & 0xff);
 549                 buff[1] = (control->next_addr & 0xff);
 550
 551                 /* EEPROM table content is stored in LE format */
 552                 if (write)
 553                         __encode_table_record_to_buff(control, record, buff + EEPROM_ADDRESS_SIZE);
 554
 555                 /*
 556                  * The destination EEPROM address might need to be corrected to account
 557                  * for page or entire memory wrapping
 558                  */
 559                 control->next_addr += EEPROM_TABLE_RECORD_SIZE;
 560         }
 561
 562         /* i2c may be unstable in gpu reset */
 563         down_read(&adev->reset_sem);
 564         ret = i2c_transfer(&adev->pm.smu_i2c, msgs, num);
 565         up_read(&adev->reset_sem);
 566
 567         if (ret < 1) {
 568                 DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret);
 569
 570                 /* TODO Restore prev next EEPROM address ? */
 571                 goto free_msgs;
 572         }
 573
 574
 575         if (!write) {
 576                 for (i = 0; i < num; i++) {
 577                         buff = &buffs[i*(EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)];
 578                         record = &records[i];
 579
 580                         __decode_table_record_from_buff(control, record, buff + EEPROM_ADDRESS_SIZE);
 581                 }
 582         }
 583
 584         if (write) {
 585                 uint32_t old_hdr_byte_sum = __calc_hdr_byte_sum(control);
 586
 587                 /*
 588                  * Update table header with size and CRC and account for table
 589                  * wrap around where the assumption is that we treat it as empty
 590                  * table
 591                  *
 592                  * TODO - Check the assumption is correct
 593                  */
 594                 control->num_recs += num;
 595                 control->num_recs %= EEPROM_MAX_RECORD_NUM;
 596                 control->tbl_hdr.tbl_size += EEPROM_TABLE_RECORD_SIZE * num;
 597                 if (control->tbl_hdr.tbl_size > EEPROM_SIZE_BYTES)
 598                         control->tbl_hdr.tbl_size = EEPROM_TABLE_HEADER_SIZE +
 599                         control->num_recs * EEPROM_TABLE_RECORD_SIZE;
 600
 601                 __update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 602
 603                 __update_table_header(control, buffs);
 604         } else if (!__validate_tbl_checksum(control, records, num)) {
 605                 DRM_WARN("EEPROM Table checksum mismatch!");
 606                 /* TODO Uncomment when EEPROM read/write is relliable */
 607                 /* ret = -EIO; */
 608         }
 609
 610 free_msgs:
 611         kfree(msgs);
 612
 613 free_buff:
 614         kfree(buffs);
 615
 616         mutex_unlock(&control->tbl_mutex);
 617
 618         return ret == num ? 0 : -EIO;
 619 }
 620
 621 inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
 622 {
 623         return EEPROM_MAX_RECORD_NUM;
 624 }
 625
 626 /* Used for testing if bugs encountered */
 627 #if 0
 628 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
 629 {
 630         int i;
 631         struct eeprom_table_record *recs = kcalloc(1, sizeof(*recs), GFP_KERNEL);
 632
 633         if (!recs)
 634                 return;
 635
 636         for (i = 0; i < 1 ; i++) {
 637                 recs[i].address = 0xdeadbeef;
 638                 recs[i].retired_page = i;
 639         }
 640
 641         if (!amdgpu_ras_eeprom_process_recods(control, recs, true, 1)) {
 642
 643                 memset(recs, 0, sizeof(*recs) * 1);
 644
 645                 control->next_addr = EEPROM_RECORD_START;
 646
 647                 if (!amdgpu_ras_eeprom_process_recods(control, recs, false, 1)) {
 648                         for (i = 0; i < 1; i++)
 649                                 DRM_INFO("rec.address :0x%llx, rec.retired_page :%llu",
 650                                          recs[i].address, recs[i].retired_page);
 651                 } else
 652                         DRM_ERROR("Failed in reading from table");
 653
 654         } else
 655                 DRM_ERROR("Failed in writing to table");
 656 }
 657 #endif