1/*
2 * Copyright 2019 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24#include "amdgpu.h"
25#include "umc_v6_7.h"
26
27static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
28				    struct ras_err_data *err_data, uint64_t err_addr,
29				    uint32_t ch_inst, uint32_t umc_inst)
30{
31	switch (adev->ip_versions[UMC_HWIP][0]) {
32	case IP_VERSION(6, 7, 0):
33		umc_v6_7_convert_error_address(adev,
34				err_data, err_addr, ch_inst, umc_inst);
35		break;
36	default:
37		dev_warn(adev->dev,
38			 "UMC address to Physical address translation is not supported\n");
39		return AMDGPU_RAS_FAIL;
40	}
41
42	return AMDGPU_RAS_SUCCESS;
43}
44
45int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
46			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
47{
48	struct ras_err_data err_data = {0, 0, 0, NULL};
49	int ret = AMDGPU_RAS_FAIL;
50
51	err_data.err_addr =
52		kcalloc(adev->umc.max_ras_err_cnt_per_query,
53			sizeof(struct eeprom_table_record), GFP_KERNEL);
54	if (!err_data.err_addr) {
55		dev_warn(adev->dev,
56			"Failed to alloc memory for umc error record in MCA notifier!\n");
57		return AMDGPU_RAS_FAIL;
58	}
59
60	/*
61	 * Translate UMC channel address to Physical address
62	 */
63	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
64					ch_inst, umc_inst);
65	if (ret)
66		goto out;
67
68	if (amdgpu_bad_page_threshold != 0) {
69		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
70						err_data.err_addr_cnt);
71		amdgpu_ras_save_bad_pages(adev, NULL);
72	}
73
74out:
75	kfree(err_data.err_addr);
76	return ret;
77}
78
79static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
80		void *ras_error_status,
81		struct amdgpu_iv_entry *entry,
82		bool reset)
83{
84	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
85	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
86	int ret = 0;
87
88	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
89	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
90	if (ret == -EOPNOTSUPP) {
91		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
92		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
93		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
94
95		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
96		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
97		    adev->umc.max_ras_err_cnt_per_query) {
98			err_data->err_addr =
99				kcalloc(adev->umc.max_ras_err_cnt_per_query,
100					sizeof(struct eeprom_table_record), GFP_KERNEL);
101
102			/* still call query_ras_error_address to clear error status
103			 * even NOMEM error is encountered
104			 */
105			if(!err_data->err_addr)
106				dev_warn(adev->dev, "Failed to alloc memory for "
107						"umc error address record!\n");
108
109			/* umc query_ras_error_address is also responsible for clearing
110			 * error status
111			 */
112			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
113		}
114	} else if (!ret) {
115		if (adev->umc.ras &&
116		    adev->umc.ras->ecc_info_query_ras_error_count)
117		    adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
118
119		if (adev->umc.ras &&
120		    adev->umc.ras->ecc_info_query_ras_error_address &&
121		    adev->umc.max_ras_err_cnt_per_query) {
122			err_data->err_addr =
123				kcalloc(adev->umc.max_ras_err_cnt_per_query,
124					sizeof(struct eeprom_table_record), GFP_KERNEL);
125
126			/* still call query_ras_error_address to clear error status
127			 * even NOMEM error is encountered
128			 */
129			if(!err_data->err_addr)
130				dev_warn(adev->dev, "Failed to alloc memory for "
131						"umc error address record!\n");
132
133			/* umc query_ras_error_address is also responsible for clearing
134			 * error status
135			 */
136			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
137		}
138	}
139
140	/* only uncorrectable error needs gpu reset */
141	if (err_data->ue_count) {
142		dev_info(adev->dev, "%ld uncorrectable hardware errors "
143				"detected in UMC block\n",
144				err_data->ue_count);
145
146		if ((amdgpu_bad_page_threshold != 0) &&
147			err_data->err_addr_cnt) {
148			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
149						err_data->err_addr_cnt);
150			amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
151
152			amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
153
154			if (con->update_channel_flag == true) {
155				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
156				con->update_channel_flag = false;
157			}
158		}
159
160		if (reset)
161			amdgpu_ras_reset_gpu(adev);
162	}
163
164	kfree(err_data->err_addr);
165	return AMDGPU_RAS_SUCCESS;
166}
167
168int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
169{
170	int ret = AMDGPU_RAS_SUCCESS;
171
172	if (adev->gmc.xgmi.connected_to_cpu ||
173		adev->gmc.is_app_apu) {
174		if (reset) {
175			/* MCA poison handler is only responsible for GPU reset,
176			 * let MCA notifier do page retirement.
177			 */
178			kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
179			amdgpu_ras_reset_gpu(adev);
180		}
181		return ret;
182	}
183
184	if (!amdgpu_sriov_vf(adev)) {
185		struct ras_err_data err_data = {0, 0, 0, NULL};
186		struct ras_common_if head = {
187			.block = AMDGPU_RAS_BLOCK__UMC,
188		};
189		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
190
191		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
192
193		if (ret == AMDGPU_RAS_SUCCESS && obj) {
194			obj->err_data.ue_count += err_data.ue_count;
195			obj->err_data.ce_count += err_data.ce_count;
196		}
197	} else {
198		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
199			adev->virt.ops->ras_poison_handler(adev);
200		else
201			dev_warn(adev->dev,
202				"No ras_poison_handler interface in SRIOV!\n");
203	}
204
205	return ret;
206}
207
208int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
209		void *ras_error_status,
210		struct amdgpu_iv_entry *entry)
211{
212	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
213}
214
215int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
216{
217	int err;
218	struct amdgpu_umc_ras *ras;
219
220	if (!adev->umc.ras)
221		return 0;
222
223	ras = adev->umc.ras;
224
225	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
226	if (err) {
227		dev_err(adev->dev, "Failed to register umc ras block!\n");
228		return err;
229	}
230
231	strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
232	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
233	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
234	adev->umc.ras_if = &ras->ras_block.ras_comm;
235
236	if (!ras->ras_block.ras_late_init)
237		ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
238
239	if (!ras->ras_block.ras_cb)
240		ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
241
242	return 0;
243}
244
245int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
246{
247	int r;
248
249	r = amdgpu_ras_block_late_init(adev, ras_block);
250	if (r)
251		return r;
252
253	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
254		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
255		if (r)
256			goto late_fini;
257	}
258
259	/* ras init of specific umc version */
260	if (adev->umc.ras &&
261	    adev->umc.ras->err_cnt_init)
262		adev->umc.ras->err_cnt_init(adev);
263
264	return 0;
265
266late_fini:
267	amdgpu_ras_block_late_fini(adev, ras_block);
268	return r;
269}
270
271int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
272		struct amdgpu_irq_src *source,
273		struct amdgpu_iv_entry *entry)
274{
275	struct ras_common_if *ras_if = adev->umc.ras_if;
276	struct ras_dispatch_if ih_data = {
277		.entry = entry,
278	};
279
280	if (!ras_if)
281		return 0;
282
283	ih_data.head = *ras_if;
284
285	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
286	return 0;
287}
288
289void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
290		uint64_t err_addr,
291		uint64_t retired_page,
292		uint32_t channel_index,
293		uint32_t umc_inst)
294{
295	struct eeprom_table_record *err_rec =
296		&err_data->err_addr[err_data->err_addr_cnt];
297
298	err_rec->address = err_addr;
299	/* page frame address is saved */
300	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
301	err_rec->ts = (uint64_t)ktime_get_real_seconds();
302	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
303	err_rec->cu = 0;
304	err_rec->mem_channel = channel_index;
305	err_rec->mcumc_id = umc_inst;
306
307	err_data->err_addr_cnt++;
308}
309
310int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
311			umc_func func, void *data)
312{
313	uint32_t node_inst       = 0;
314	uint32_t umc_inst        = 0;
315	uint32_t ch_inst         = 0;
316	int ret = 0;
317
318	if (adev->umc.node_inst_num) {
319		LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
320			ret = func(adev, node_inst, umc_inst, ch_inst, data);
321			if (ret) {
322				dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n",
323					node_inst, umc_inst, ch_inst, ret);
324				return ret;
325			}
326		}
327	} else {
328		LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
329			ret = func(adev, 0, umc_inst, ch_inst, data);
330			if (ret) {
331				dev_err(adev->dev, "Umc %d ch %d func returns %d\n",
332					umc_inst, ch_inst, ret);
333				return ret;
334			}
335		}
336	}
337
338	return 0;
339}
340