/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci#include "amdgpu_ras.h"
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_ciint amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
278c2ecf20Sopenharmony_ci{
288c2ecf20Sopenharmony_ci	int r;
298c2ecf20Sopenharmony_ci	struct ras_fs_if fs_info = {
308c2ecf20Sopenharmony_ci		.sysfs_name = "umc_err_count",
318c2ecf20Sopenharmony_ci	};
328c2ecf20Sopenharmony_ci	struct ras_ih_if ih_info = {
338c2ecf20Sopenharmony_ci		.cb = amdgpu_umc_process_ras_data_cb,
348c2ecf20Sopenharmony_ci	};
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci	if (!adev->umc.ras_if) {
378c2ecf20Sopenharmony_ci		adev->umc.ras_if =
388c2ecf20Sopenharmony_ci			kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
398c2ecf20Sopenharmony_ci		if (!adev->umc.ras_if)
408c2ecf20Sopenharmony_ci			return -ENOMEM;
418c2ecf20Sopenharmony_ci		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
428c2ecf20Sopenharmony_ci		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
438c2ecf20Sopenharmony_ci		adev->umc.ras_if->sub_block_index = 0;
448c2ecf20Sopenharmony_ci		strcpy(adev->umc.ras_if->name, "umc");
458c2ecf20Sopenharmony_ci	}
468c2ecf20Sopenharmony_ci	ih_info.head = fs_info.head = *adev->umc.ras_if;
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_ci	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
498c2ecf20Sopenharmony_ci				 &fs_info, &ih_info);
508c2ecf20Sopenharmony_ci	if (r)
518c2ecf20Sopenharmony_ci		goto free;
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_ci	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
548c2ecf20Sopenharmony_ci		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
558c2ecf20Sopenharmony_ci		if (r)
568c2ecf20Sopenharmony_ci			goto late_fini;
578c2ecf20Sopenharmony_ci	} else {
588c2ecf20Sopenharmony_ci		r = 0;
598c2ecf20Sopenharmony_ci		goto free;
608c2ecf20Sopenharmony_ci	}
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci	/* ras init of specific umc version */
638c2ecf20Sopenharmony_ci	if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
648c2ecf20Sopenharmony_ci		adev->umc.funcs->err_cnt_init(adev);
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	return 0;
678c2ecf20Sopenharmony_ci
688c2ecf20Sopenharmony_cilate_fini:
698c2ecf20Sopenharmony_ci	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
708c2ecf20Sopenharmony_cifree:
718c2ecf20Sopenharmony_ci	kfree(adev->umc.ras_if);
728c2ecf20Sopenharmony_ci	adev->umc.ras_if = NULL;
738c2ecf20Sopenharmony_ci	return r;
748c2ecf20Sopenharmony_ci}
758c2ecf20Sopenharmony_ci
768c2ecf20Sopenharmony_civoid amdgpu_umc_ras_fini(struct amdgpu_device *adev)
778c2ecf20Sopenharmony_ci{
788c2ecf20Sopenharmony_ci	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
798c2ecf20Sopenharmony_ci			adev->umc.ras_if) {
808c2ecf20Sopenharmony_ci		struct ras_common_if *ras_if = adev->umc.ras_if;
818c2ecf20Sopenharmony_ci		struct ras_ih_if ih_info = {
828c2ecf20Sopenharmony_ci			.head = *ras_if,
838c2ecf20Sopenharmony_ci			.cb = amdgpu_umc_process_ras_data_cb,
848c2ecf20Sopenharmony_ci		};
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
878c2ecf20Sopenharmony_ci		kfree(ras_if);
888c2ecf20Sopenharmony_ci	}
898c2ecf20Sopenharmony_ci}
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ciint amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
928c2ecf20Sopenharmony_ci		void *ras_error_status,
938c2ecf20Sopenharmony_ci		struct amdgpu_iv_entry *entry)
948c2ecf20Sopenharmony_ci{
958c2ecf20Sopenharmony_ci	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_ci	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
988c2ecf20Sopenharmony_ci	if (adev->umc.funcs &&
998c2ecf20Sopenharmony_ci	    adev->umc.funcs->query_ras_error_count)
1008c2ecf20Sopenharmony_ci	    adev->umc.funcs->query_ras_error_count(adev, ras_error_status);
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci	if (adev->umc.funcs &&
1038c2ecf20Sopenharmony_ci	    adev->umc.funcs->query_ras_error_address &&
1048c2ecf20Sopenharmony_ci	    adev->umc.max_ras_err_cnt_per_query) {
1058c2ecf20Sopenharmony_ci		err_data->err_addr =
1068c2ecf20Sopenharmony_ci			kcalloc(adev->umc.max_ras_err_cnt_per_query,
1078c2ecf20Sopenharmony_ci				sizeof(struct eeprom_table_record), GFP_KERNEL);
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci		/* still call query_ras_error_address to clear error status
1108c2ecf20Sopenharmony_ci		 * even NOMEM error is encountered
1118c2ecf20Sopenharmony_ci		 */
1128c2ecf20Sopenharmony_ci		if(!err_data->err_addr)
1138c2ecf20Sopenharmony_ci			dev_warn(adev->dev, "Failed to alloc memory for "
1148c2ecf20Sopenharmony_ci					"umc error address record!\n");
1158c2ecf20Sopenharmony_ci
1168c2ecf20Sopenharmony_ci		/* umc query_ras_error_address is also responsible for clearing
1178c2ecf20Sopenharmony_ci		 * error status
1188c2ecf20Sopenharmony_ci		 */
1198c2ecf20Sopenharmony_ci		adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
1208c2ecf20Sopenharmony_ci	}
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	/* only uncorrectable error needs gpu reset */
1238c2ecf20Sopenharmony_ci	if (err_data->ue_count) {
1248c2ecf20Sopenharmony_ci		dev_info(adev->dev, "%ld uncorrectable hardware errors "
1258c2ecf20Sopenharmony_ci				"detected in UMC block\n",
1268c2ecf20Sopenharmony_ci				err_data->ue_count);
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci		if ((amdgpu_bad_page_threshold != 0) &&
1298c2ecf20Sopenharmony_ci			err_data->err_addr_cnt &&
1308c2ecf20Sopenharmony_ci			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
1318c2ecf20Sopenharmony_ci						err_data->err_addr_cnt))
1328c2ecf20Sopenharmony_ci			dev_warn(adev->dev, "Failed to add ras bad page!\n");
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci		amdgpu_ras_reset_gpu(adev);
1358c2ecf20Sopenharmony_ci	}
1368c2ecf20Sopenharmony_ci
1378c2ecf20Sopenharmony_ci	kfree(err_data->err_addr);
1388c2ecf20Sopenharmony_ci	return AMDGPU_RAS_SUCCESS;
1398c2ecf20Sopenharmony_ci}
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ciint amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
1428c2ecf20Sopenharmony_ci		struct amdgpu_irq_src *source,
1438c2ecf20Sopenharmony_ci		struct amdgpu_iv_entry *entry)
1448c2ecf20Sopenharmony_ci{
1458c2ecf20Sopenharmony_ci	struct ras_common_if *ras_if = adev->umc.ras_if;
1468c2ecf20Sopenharmony_ci	struct ras_dispatch_if ih_data = {
1478c2ecf20Sopenharmony_ci		.entry = entry,
1488c2ecf20Sopenharmony_ci	};
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci	if (!ras_if)
1518c2ecf20Sopenharmony_ci		return 0;
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci	ih_data.head = *ras_if;
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
1568c2ecf20Sopenharmony_ci	return 0;
1578c2ecf20Sopenharmony_ci}
158