/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v8_7.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_8_7_0_offset.h"
#include "umc/umc_8_7_0_sh_mask.h"

#define UMC_8_INST_DIST                 0x40000

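/*
 * Mapping from (UMC instance, channel instance) to the physical channel
 * index used for address translation. The values appear to encode the
 * board-level channel swizzling for this ASIC.
 */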
const uint32_t
        umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
                {2, 11},  {4, 13},
                {1, 8},   {7, 14},
                {10, 3},  {12, 5},
                {9, 0},   {15, 6}
};

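/*
 * Register offset for one UMC channel: channels within a UMC instance are
 * channel_offs registers apart, and UMC instances are UMC_8_INST_DIST apart.
 * For example (illustrative values only), with channel_offs = 0x400,
 * umc_inst = 1 and ch_inst = 1 this yields 0x400 + 0x40000.
 */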
static inline uint32_t get_umc_v8_7_reg_offset(struct amdgpu_device *adev,
                                            uint32_t umc_inst,
                                            uint32_t ch_inst)
{
        return adev->umc.channel_offs * ch_inst + UMC_8_INST_DIST * umc_inst;
}

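/*
 * The ecc_info_* variants below do not touch UMC registers directly; they
 * read the MCA status words cached in the amdgpu_ras context (umc_ecc),
 * which is presumably filled in on the driver's behalf (e.g. by firmware
 * or the host) when direct register access is unavailable.
 */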
static void umc_v8_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
                                                uint32_t umc_inst, uint32_t ch_inst,
                                                unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

        /* check for SRAM correctable error
         * MCUMC_STATUS is a 64 bit register
         */
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
                *error_count += 1;
}

static void umc_v8_7_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev,
                                                        uint32_t umc_inst, uint32_t ch_inst,
                                                        unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

        /* check the MCUMC_STATUS */
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
                *error_count += 1;
}

static void umc_v8_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
                                        void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;

        /* TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection
         * when firmware interface is ready
         */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_v8_7_ecc_info_query_correctable_error_count(adev,
                                                        umc_inst, ch_inst,
                                                        &(err_data->ce_count));
                umc_v8_7_ecc_info_query_uncorrectable_error_count(adev,
                                                        umc_inst, ch_inst,
                                                        &(err_data->ue_count));
        }
}

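/*
 * Build the retired-page SoC physical address from a UMC channel address.
 * With the 256-byte channel interleave, roughly: bits [7:0] are the offset
 * inside a 256B block, the channel index occupies the bits just above, and
 * the remaining channel address bits sit above those (see the
 * ADDR_OF_4KB_BLOCK/ADDR_OF_256B_BLOCK/OFFSET_IN_256B_BLOCK helpers).
 */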
static void umc_v8_7_convert_error_address(struct amdgpu_device *adev,
                                        struct ras_err_data *err_data, uint64_t err_addr,
                                        uint32_t ch_inst, uint32_t umc_inst)
{
        uint64_t retired_page;
        uint32_t channel_index;

        channel_index =
                adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

        /* translate umc channel address to soc pa, 3 parts are included */
        retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
                        ADDR_OF_256B_BLOCK(channel_index) |
                        OFFSET_IN_256B_BLOCK(err_addr);

        amdgpu_umc_fill_error_record(err_data, err_addr,
                                retired_page, channel_index, umc_inst);
}

static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev,
                                        struct ras_err_data *err_data,
                                        uint32_t ch_inst,
                                        uint32_t umc_inst)
{
        uint64_t mc_umc_status, err_addr;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
        mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;

        if (mc_umc_status == 0)
                return;

        if (!err_data->err_addr)
                return;

        /* calculate error address if ue error is detected */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {

                err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

                umc_v8_7_convert_error_address(adev, err_data, err_addr,
                                                ch_inst, umc_inst);
        }
}

static void umc_v8_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
                                        void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;

        /* TODO: driver needs to toggle DF Cstate to ensure
         * safe access of UMC registers. Will add the protection
         * when firmware interface is ready
         */
        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_v8_7_ecc_info_query_error_address(adev,
                                                err_data,
                                                ch_inst,
                                                umc_inst);
        }
}

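/*
 * GeccErrCntCsSel selects which chip select of the channel GeccErrCnt
 * reports, so each channel is handled twice (lower and higher chip).
 * The counter is reset to UMC_V8_7_CE_CNT_INIT rather than zero,
 * presumably so it can trip the correctable-error interrupt after the
 * configured threshold; the query path subtracts the same value.
 */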
static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
                                        uint32_t umc_reg_offset)
{
        uint32_t ecc_err_cnt_addr;
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

        /* select the lower chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                                        umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                                        UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
                        ecc_err_cnt_sel);

        /* clear lower chip error count */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
                        UMC_V8_7_CE_CNT_INIT);

        /* select the higher chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
                                        umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
                                        UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
                        ecc_err_cnt_sel);

        /* clear higher chip error count */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
                        UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
{
        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                umc_inst,
                                                ch_inst);

                umc_v8_7_clear_error_count_per_channel(adev,
                                                umc_reg_offset);
        }
}

static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
                                                   uint32_t umc_reg_offset,
                                                   unsigned long *error_count)
{
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
        uint32_t ecc_err_cnt, ecc_err_cnt_addr;
        uint64_t mc_umc_status;
        uint32_t mc_umc_status_addr;

        /* UMC 8_7_2 registers */
        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

        /* select the lower chip and check the error count */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
                 UMC_V8_7_CE_CNT_INIT);

        /* select the higher chip and check the error count */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
                 UMC_V8_7_CE_CNT_INIT);

        /* check for SRAM correctable error
         * MCUMC_STATUS is a 64 bit register
         */
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
                *error_count += 1;
}

static void umc_v8_7_query_uncorrectable_error_count(struct amdgpu_device *adev,
                                                     uint32_t umc_reg_offset,
                                                     unsigned long *error_count)
{
        uint64_t mc_umc_status;
        uint32_t mc_umc_status_addr;

        mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

        /* check the MCUMC_STATUS */
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
                *error_count += 1;
}

static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
                                           void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                      umc_inst,
                                                      ch_inst);

                umc_v8_7_query_correctable_error_count(adev,
                                                       umc_reg_offset,
                                                       &(err_data->ce_count));
                umc_v8_7_query_uncorrectable_error_count(adev,
                                                         umc_reg_offset,
                                                         &(err_data->ue_count));
        }

        umc_v8_7_clear_error_count(adev);
}

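/*
 * Direct-register variant: reads MCA_UMC_UMC0_MCUMC_STATUST0/ADDRT0 and
 * clears the status register when done. The ADDRT0 LSB field gives the
 * number of low-order address bits that are not valid for this error,
 * so those bits are masked off before the address is converted.
 */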
static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
                                         struct ras_err_data *err_data,
                                         uint32_t umc_reg_offset,
                                         uint32_t ch_inst,
                                         uint32_t umc_inst)
{
        uint32_t lsb, mc_umc_status_addr;
        uint64_t mc_umc_status, err_addr, mc_umc_addrt0;

        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
        mc_umc_addrt0 =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

        if (mc_umc_status == 0)
                return;

        if (!err_data->err_addr) {
                /* clear umc status */
                WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
                return;
        }

        /* calculate error address if ue error is detected */
        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {

                err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
                /* the lowest lsb bits should be ignored */
                lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
                err_addr &= ~((0x1ULL << lsb) - 1);

                umc_v8_7_convert_error_address(adev, err_data, err_addr,
                                                ch_inst, umc_inst);
        }

        /* clear umc status */
        WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
                                             void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                      umc_inst,
                                                      ch_inst);

                umc_v8_7_query_error_address(adev,
                                             err_data,
                                             umc_reg_offset,
                                             ch_inst,
                                             umc_inst);
        }
}

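/*
 * Preset both chip selects' GeccErrCnt to UMC_V8_7_CE_CNT_INIT (the query
 * path subtracts the same value from the raw count) and set the
 * correctable-error interrupt type to an APIC based interrupt.
 */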
static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
                                              uint32_t umc_reg_offset)
{
        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
        uint32_t ecc_err_cnt_addr;

        ecc_err_cnt_sel_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
        ecc_err_cnt_addr =
                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

        /* select the lower chip */
        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 0);
        /* set ce error interrupt type to APIC based interrupt */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrInt, 0x1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
        /* set error count to initial value */
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);

        /* select the higher chip and set its error count to the initial value */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
                                        GeccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
}

static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
{
        uint32_t umc_inst        = 0;
        uint32_t ch_inst         = 0;
        uint32_t umc_reg_offset  = 0;

        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
                umc_reg_offset = get_umc_v8_7_reg_offset(adev,
                                                      umc_inst,
                                                      ch_inst);

                umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
        }
}

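/*
 * RAS framework hookup: the hw_ops read the UMC registers directly, while
 * the ecc_info_* callbacks serve the same queries from the cached ECC
 * table when direct register access is not used.
 */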
const struct amdgpu_ras_block_hw_ops umc_v8_7_ras_hw_ops = {
        .query_ras_error_count = umc_v8_7_query_ras_error_count,
        .query_ras_error_address = umc_v8_7_query_ras_error_address,
};

struct amdgpu_umc_ras umc_v8_7_ras = {
        .ras_block = {
                .hw_ops = &umc_v8_7_ras_hw_ops,
        },
        .err_cnt_init = umc_v8_7_err_cnt_init,
        .ecc_info_query_ras_error_count = umc_v8_7_ecc_info_query_ras_error_count,
        .ecc_info_query_ras_error_address = umc_v8_7_ecc_info_query_ras_error_address,
};