Merge tag 'for-linus-5.15-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6-microblaze.git] / drivers / gpu / drm / amd / amdgpu / umc_v8_7.c
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v8_7.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu_umc.h"
26 #include "amdgpu.h"
27
28 #include "rsmu/rsmu_0_0_2_offset.h"
29 #include "rsmu/rsmu_0_0_2_sh_mask.h"
30 #include "umc/umc_8_7_0_offset.h"
31 #include "umc/umc_8_7_0_sh_mask.h"
32
33 #define UMC_8_INST_DIST                 0x40000
34
/*
 * Per-(UMC instance, channel instance) physical memory channel index.
 * Consumed via adev->umc.channel_idx_tbl (see umc_v8_7_query_error_address)
 * when composing the SoC physical address of a retired page.
 */
const uint32_t
	umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
		{2, 11},  {4, 13},
		{1, 8},   {7, 14},
		{10, 3},  {12, 5},
		{9, 0},   {15, 6}
};
42
43 static inline uint32_t get_umc_8_reg_offset(struct amdgpu_device *adev,
44                                             uint32_t umc_inst,
45                                             uint32_t ch_inst)
46 {
47         return adev->umc.channel_offs*ch_inst + UMC_8_INST_DIST*umc_inst;
48 }
49
/*
 * Reset the GECC error counters of one UMC channel for both chip selects,
 * writing each back to its preset value UMC_V8_7_CE_CNT_INIT.
 *
 * The counter register is banked by chip select: GeccErrCntSel.GeccErrCntCsSel
 * must be programmed (0 = lower chip, 1 = higher chip) before each write.
 * Register access goes through the PCIE indirect interface; the "* 4"
 * converts the SOC15 dword offset to a byte address.
 */
static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);
}
87
88 static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
89 {
90         uint32_t umc_inst        = 0;
91         uint32_t ch_inst         = 0;
92         uint32_t umc_reg_offset  = 0;
93
94         LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
95                 umc_reg_offset = get_umc_8_reg_offset(adev,
96                                                 umc_inst,
97                                                 ch_inst);
98
99                 umc_v8_7_clear_error_count_per_channel(adev,
100                                                 umc_reg_offset);
101         }
102 }
103
/*
 * Accumulate the correctable (CE) error count of one UMC channel into
 * *error_count.
 *
 * Two sources are summed:
 *  - the banked GECC error counter, read once per chip select (lower,
 *    then higher), each value reported relative to its preset
 *    UMC_V8_7_CE_CNT_INIT;
 *  - the 64-bit MCA status register, which adds one when a correctable
 *    SRAM ECC error is latched there.
 */
static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 8_7_2 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* check for SRAM correctable error;
	 * MCUMC_STATUS is a 64 bit register.
	 * NOTE(review): ErrorCodeExt == 6 presumably identifies an SRAM ECC
	 * error class — confirm against the MCA extended-error-code spec.
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
150
151 static void umc_v8_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
152                                                       uint32_t umc_reg_offset,
153                                                       unsigned long *error_count)
154 {
155         uint64_t mc_umc_status;
156         uint32_t mc_umc_status_addr;
157
158         mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
159
160         /* check the MCUMC_STATUS */
161         mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
162         if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
163             (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
164             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
165             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
166             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
167             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
168                 *error_count += 1;
169 }
170
171 static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
172                                            void *ras_error_status)
173 {
174         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
175
176         uint32_t umc_inst        = 0;
177         uint32_t ch_inst         = 0;
178         uint32_t umc_reg_offset  = 0;
179
180         LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
181                 umc_reg_offset = get_umc_8_reg_offset(adev,
182                                                       umc_inst,
183                                                       ch_inst);
184
185                 umc_v8_7_query_correctable_error_count(adev,
186                                                        umc_reg_offset,
187                                                        &(err_data->ce_count));
188                 umc_v8_7_querry_uncorrectable_error_count(adev,
189                                                           umc_reg_offset,
190                                                           &(err_data->ue_count));
191         }
192
193         umc_v8_7_clear_error_count(adev);
194 }
195
/*
 * Read the MCA status/address registers of one UMC channel and, when a
 * UE ECC error is latched, translate the error address to a SoC physical
 * address and append a retirement record to err_data->err_addr.
 *
 * The 64-bit status register is cleared on every path that consumed it
 * (including the no-buffer case), so a stale error is never reported twice.
 */
static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	/* physical memory channel for this (umc_inst, ch_inst) pair */
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* nothing latched for this channel */
	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* no buffer to record into: just clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	/* next free slot in the caller-provided record array */
	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate umc channel address to soc pa, 3 parts are included:
		 * 4KB-block base from the error address, 256B-block slot from
		 * the channel index, and the offset inside the 256B block.
		 */
		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
260
261 static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
262                                              void *ras_error_status)
263 {
264         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
265
266         uint32_t umc_inst        = 0;
267         uint32_t ch_inst         = 0;
268         uint32_t umc_reg_offset  = 0;
269
270         LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
271                 umc_reg_offset = get_umc_8_reg_offset(adev,
272                                                       umc_inst,
273                                                       ch_inst);
274
275                 umc_v8_7_query_error_address(adev,
276                                              err_data,
277                                              umc_reg_offset,
278                                              ch_inst,
279                                              umc_inst);
280         }
281 }
282
/*
 * Arm correctable-error counting on one UMC channel: preset the GECC
 * error counter of both chip selects to UMC_V8_7_CE_CNT_INIT and route
 * the CE error interrupt to the APIC.
 *
 * Like the clear path, the counter is banked by GeccErrCntCsSel
 * (0 = lower chip, 1 = higher chip), so select/write is done twice.
 */
static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
}
311
312 static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
313 {
314         uint32_t umc_inst        = 0;
315         uint32_t ch_inst         = 0;
316         uint32_t umc_reg_offset  = 0;
317
318         LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
319                 umc_reg_offset = get_umc_8_reg_offset(adev,
320                                                       umc_inst,
321                                                       ch_inst);
322
323                 umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
324         }
325 }
326
/* UMC 8.7 RAS callbacks wired into the common amdgpu UMC RAS layer;
 * late-init/fini are delegated to the shared amdgpu_umc helpers.
 */
const struct amdgpu_umc_ras_funcs umc_v8_7_ras_funcs = {
	.err_cnt_init = umc_v8_7_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.ras_fini = amdgpu_umc_ras_fini,
	.query_ras_error_count = umc_v8_7_query_ras_error_count,
	.query_ras_error_address = umc_v8_7_query_ras_error_address,
};