162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * Copyright 2018 Advanced Micro Devices, Inc.
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
562306a36Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
662306a36Sopenharmony_ci * to deal in the Software without restriction, including without limitation
762306a36Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
862306a36Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
962306a36Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci * The above copyright notice and this permission notice shall be included in
1262306a36Sopenharmony_ci * all copies or substantial portions of the Software.
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1562306a36Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1662306a36Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1762306a36Sopenharmony_ci * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
1862306a36Sopenharmony_ci * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
1962306a36Sopenharmony_ci * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
2062306a36Sopenharmony_ci * OTHER DEALINGS IN THE SOFTWARE.
2162306a36Sopenharmony_ci *
2262306a36Sopenharmony_ci *
2362306a36Sopenharmony_ci */
2462306a36Sopenharmony_ci#include <linux/list.h>
2562306a36Sopenharmony_ci#include "amdgpu.h"
2662306a36Sopenharmony_ci#include "amdgpu_xgmi.h"
2762306a36Sopenharmony_ci#include "amdgpu_ras.h"
2862306a36Sopenharmony_ci#include "soc15.h"
2962306a36Sopenharmony_ci#include "df/df_3_6_offset.h"
3062306a36Sopenharmony_ci#include "xgmi/xgmi_4_0_0_smn.h"
3162306a36Sopenharmony_ci#include "xgmi/xgmi_4_0_0_sh_mask.h"
3262306a36Sopenharmony_ci#include "xgmi/xgmi_6_1_0_sh_mask.h"
3362306a36Sopenharmony_ci#include "wafl/wafl2_4_0_0_smn.h"
3462306a36Sopenharmony_ci#include "wafl/wafl2_4_0_0_sh_mask.h"
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_ci#include "amdgpu_reset.h"
3762306a36Sopenharmony_ci
/* SMN addresses of the XGMI3X16 PCS error registers (first instance). */
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK   0x11a00218

/* SMN addresses of the GOPX1 PCS error registers (first instance). */
#define smnPCS_GOPX1_PCS_ERROR_STATUS    0x12200210
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK      0x12200218

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

/* All hives known to the driver; walked under xgmi_mutex. */
static LIST_HEAD(xgmi_hive_list);

/* Serializes lookups and updates of xgmi_hive_list. */
static DEFINE_MUTEX(xgmi_mutex);
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_cistatic const int xgmi_pcs_err_status_reg_vg20[] = {
5062306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
5162306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
5262306a36Sopenharmony_ci};
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_cistatic const int wafl_pcs_err_status_reg_vg20[] = {
5562306a36Sopenharmony_ci	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
5662306a36Sopenharmony_ci	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
5762306a36Sopenharmony_ci};
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_cistatic const int xgmi_pcs_err_status_reg_arct[] = {
6062306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
6162306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
6262306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
6362306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
6462306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
6562306a36Sopenharmony_ci	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
6662306a36Sopenharmony_ci};
6762306a36Sopenharmony_ci
6862306a36Sopenharmony_ci/* same as vg20*/
6962306a36Sopenharmony_cistatic const int wafl_pcs_err_status_reg_arct[] = {
7062306a36Sopenharmony_ci	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
7162306a36Sopenharmony_ci	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
7262306a36Sopenharmony_ci};
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_cistatic const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
7562306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
7662306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
7762306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
7862306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
7962306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
8062306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
8162306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
8262306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
8362306a36Sopenharmony_ci};
8462306a36Sopenharmony_ci
8562306a36Sopenharmony_cistatic const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
8662306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
8762306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000,
8862306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000,
8962306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000,
9062306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000,
9162306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000,
9262306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000,
9362306a36Sopenharmony_ci	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000
9462306a36Sopenharmony_ci};
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_cistatic const int walf_pcs_err_status_reg_aldebaran[] = {
9762306a36Sopenharmony_ci	smnPCS_GOPX1_PCS_ERROR_STATUS,
9862306a36Sopenharmony_ci	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
9962306a36Sopenharmony_ci};
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_cistatic const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
10262306a36Sopenharmony_ci	smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK,
10362306a36Sopenharmony_ci	smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
10462306a36Sopenharmony_ci};
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_cistatic const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
10762306a36Sopenharmony_ci	{"XGMI PCS DataLossErr",
10862306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
10962306a36Sopenharmony_ci	{"XGMI PCS TrainingErr",
11062306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
11162306a36Sopenharmony_ci	{"XGMI PCS CRCErr",
11262306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
11362306a36Sopenharmony_ci	{"XGMI PCS BERExceededErr",
11462306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
11562306a36Sopenharmony_ci	{"XGMI PCS TxMetaDataErr",
11662306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
11762306a36Sopenharmony_ci	{"XGMI PCS ReplayBufParityErr",
11862306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
11962306a36Sopenharmony_ci	{"XGMI PCS DataParityErr",
12062306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
12162306a36Sopenharmony_ci	{"XGMI PCS ReplayFifoOverflowErr",
12262306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
12362306a36Sopenharmony_ci	{"XGMI PCS ReplayFifoUnderflowErr",
12462306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
12562306a36Sopenharmony_ci	{"XGMI PCS ElasticFifoOverflowErr",
12662306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
12762306a36Sopenharmony_ci	{"XGMI PCS DeskewErr",
12862306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
12962306a36Sopenharmony_ci	{"XGMI PCS DataStartupLimitErr",
13062306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
13162306a36Sopenharmony_ci	{"XGMI PCS FCInitTimeoutErr",
13262306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
13362306a36Sopenharmony_ci	{"XGMI PCS RecoveryTimeoutErr",
13462306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
13562306a36Sopenharmony_ci	{"XGMI PCS ReadySerialTimeoutErr",
13662306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
13762306a36Sopenharmony_ci	{"XGMI PCS ReadySerialAttemptErr",
13862306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
13962306a36Sopenharmony_ci	{"XGMI PCS RecoveryAttemptErr",
14062306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
14162306a36Sopenharmony_ci	{"XGMI PCS RecoveryRelockAttemptErr",
14262306a36Sopenharmony_ci	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
14362306a36Sopenharmony_ci};
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_cistatic const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
14662306a36Sopenharmony_ci	{"WAFL PCS DataLossErr",
14762306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
14862306a36Sopenharmony_ci	{"WAFL PCS TrainingErr",
14962306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
15062306a36Sopenharmony_ci	{"WAFL PCS CRCErr",
15162306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
15262306a36Sopenharmony_ci	{"WAFL PCS BERExceededErr",
15362306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
15462306a36Sopenharmony_ci	{"WAFL PCS TxMetaDataErr",
15562306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
15662306a36Sopenharmony_ci	{"WAFL PCS ReplayBufParityErr",
15762306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
15862306a36Sopenharmony_ci	{"WAFL PCS DataParityErr",
15962306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
16062306a36Sopenharmony_ci	{"WAFL PCS ReplayFifoOverflowErr",
16162306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
16262306a36Sopenharmony_ci	{"WAFL PCS ReplayFifoUnderflowErr",
16362306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
16462306a36Sopenharmony_ci	{"WAFL PCS ElasticFifoOverflowErr",
16562306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
16662306a36Sopenharmony_ci	{"WAFL PCS DeskewErr",
16762306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
16862306a36Sopenharmony_ci	{"WAFL PCS DataStartupLimitErr",
16962306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
17062306a36Sopenharmony_ci	{"WAFL PCS FCInitTimeoutErr",
17162306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
17262306a36Sopenharmony_ci	{"WAFL PCS RecoveryTimeoutErr",
17362306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
17462306a36Sopenharmony_ci	{"WAFL PCS ReadySerialTimeoutErr",
17562306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
17662306a36Sopenharmony_ci	{"WAFL PCS ReadySerialAttemptErr",
17762306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
17862306a36Sopenharmony_ci	{"WAFL PCS RecoveryAttemptErr",
17962306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
18062306a36Sopenharmony_ci	{"WAFL PCS RecoveryRelockAttemptErr",
18162306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
18262306a36Sopenharmony_ci};
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_cistatic const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
18562306a36Sopenharmony_ci	{"XGMI3X16 PCS DataLossErr",
18662306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)},
18762306a36Sopenharmony_ci	{"XGMI3X16 PCS TrainingErr",
18862306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)},
18962306a36Sopenharmony_ci	{"XGMI3X16 PCS FlowCtrlAckErr",
19062306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)},
19162306a36Sopenharmony_ci	{"XGMI3X16 PCS RxFifoUnderflowErr",
19262306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)},
19362306a36Sopenharmony_ci	{"XGMI3X16 PCS RxFifoOverflowErr",
19462306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)},
19562306a36Sopenharmony_ci	{"XGMI3X16 PCS CRCErr",
19662306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)},
19762306a36Sopenharmony_ci	{"XGMI3X16 PCS BERExceededErr",
19862306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)},
19962306a36Sopenharmony_ci	{"XGMI3X16 PCS TxVcidDataErr",
20062306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)},
20162306a36Sopenharmony_ci	{"XGMI3X16 PCS ReplayBufParityErr",
20262306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)},
20362306a36Sopenharmony_ci	{"XGMI3X16 PCS DataParityErr",
20462306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)},
20562306a36Sopenharmony_ci	{"XGMI3X16 PCS ReplayFifoOverflowErr",
20662306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
20762306a36Sopenharmony_ci	{"XGMI3X16 PCS ReplayFifoUnderflowErr",
20862306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
20962306a36Sopenharmony_ci	{"XGMI3X16 PCS ElasticFifoOverflowErr",
21062306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
21162306a36Sopenharmony_ci	{"XGMI3X16 PCS DeskewErr",
21262306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)},
21362306a36Sopenharmony_ci	{"XGMI3X16 PCS FlowCtrlCRCErr",
21462306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)},
21562306a36Sopenharmony_ci	{"XGMI3X16 PCS DataStartupLimitErr",
21662306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)},
21762306a36Sopenharmony_ci	{"XGMI3X16 PCS FCInitTimeoutErr",
21862306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
21962306a36Sopenharmony_ci	{"XGMI3X16 PCS RecoveryTimeoutErr",
22062306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
22162306a36Sopenharmony_ci	{"XGMI3X16 PCS ReadySerialTimeoutErr",
22262306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
22362306a36Sopenharmony_ci	{"XGMI3X16 PCS ReadySerialAttemptErr",
22462306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
22562306a36Sopenharmony_ci	{"XGMI3X16 PCS RecoveryAttemptErr",
22662306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
22762306a36Sopenharmony_ci	{"XGMI3X16 PCS RecoveryRelockAttemptErr",
22862306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
22962306a36Sopenharmony_ci	{"XGMI3X16 PCS ReplayAttemptErr",
23062306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)},
23162306a36Sopenharmony_ci	{"XGMI3X16 PCS SyncHdrErr",
23262306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)},
23362306a36Sopenharmony_ci	{"XGMI3X16 PCS TxReplayTimeoutErr",
23462306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)},
23562306a36Sopenharmony_ci	{"XGMI3X16 PCS RxReplayTimeoutErr",
23662306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)},
23762306a36Sopenharmony_ci	{"XGMI3X16 PCS LinkSubTxTimeoutErr",
23862306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)},
23962306a36Sopenharmony_ci	{"XGMI3X16 PCS LinkSubRxTimeoutErr",
24062306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)},
24162306a36Sopenharmony_ci	{"XGMI3X16 PCS RxCMDPktErr",
24262306a36Sopenharmony_ci	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
24362306a36Sopenharmony_ci};
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci/**
24662306a36Sopenharmony_ci * DOC: AMDGPU XGMI Support
24762306a36Sopenharmony_ci *
24862306a36Sopenharmony_ci * XGMI is a high speed interconnect that joins multiple GPU cards
24962306a36Sopenharmony_ci * into a homogeneous memory space that is organized by a collective
25062306a36Sopenharmony_ci * hive ID and individual node IDs, both of which are 64-bit numbers.
25162306a36Sopenharmony_ci *
25262306a36Sopenharmony_ci * The file xgmi_device_id contains the unique per GPU device ID and
25362306a36Sopenharmony_ci * is stored in the /sys/class/drm/card${cardno}/device/ directory.
25462306a36Sopenharmony_ci *
25562306a36Sopenharmony_ci * Inside the device directory a sub-directory 'xgmi_hive_info' is
25662306a36Sopenharmony_ci * created which contains the hive ID and the list of nodes.
25762306a36Sopenharmony_ci *
25862306a36Sopenharmony_ci * The hive ID is stored in:
25962306a36Sopenharmony_ci *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
26062306a36Sopenharmony_ci *
26162306a36Sopenharmony_ci * The node information is stored in numbered directories:
26262306a36Sopenharmony_ci *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
26362306a36Sopenharmony_ci *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
26662306a36Sopenharmony_ci *
26762306a36Sopenharmony_ci * The XGMI memory space is built by contiguously adding the power of
26862306a36Sopenharmony_ci * two padded VRAM space from each node to each other.
26962306a36Sopenharmony_ci *
27062306a36Sopenharmony_ci */
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_cistatic struct attribute amdgpu_xgmi_hive_id = {
27362306a36Sopenharmony_ci	.name = "xgmi_hive_id",
27462306a36Sopenharmony_ci	.mode = S_IRUGO
27562306a36Sopenharmony_ci};
27662306a36Sopenharmony_ci
27762306a36Sopenharmony_cistatic struct attribute *amdgpu_xgmi_hive_attrs[] = {
27862306a36Sopenharmony_ci	&amdgpu_xgmi_hive_id,
27962306a36Sopenharmony_ci	NULL
28062306a36Sopenharmony_ci};
28162306a36Sopenharmony_ciATTRIBUTE_GROUPS(amdgpu_xgmi_hive);
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
28462306a36Sopenharmony_ci	struct attribute *attr, char *buf)
28562306a36Sopenharmony_ci{
28662306a36Sopenharmony_ci	struct amdgpu_hive_info *hive = container_of(
28762306a36Sopenharmony_ci		kobj, struct amdgpu_hive_info, kobj);
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	if (attr == &amdgpu_xgmi_hive_id)
29062306a36Sopenharmony_ci		return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci	return 0;
29362306a36Sopenharmony_ci}
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_cistatic void amdgpu_xgmi_hive_release(struct kobject *kobj)
29662306a36Sopenharmony_ci{
29762306a36Sopenharmony_ci	struct amdgpu_hive_info *hive = container_of(
29862306a36Sopenharmony_ci		kobj, struct amdgpu_hive_info, kobj);
29962306a36Sopenharmony_ci
30062306a36Sopenharmony_ci	amdgpu_reset_put_reset_domain(hive->reset_domain);
30162306a36Sopenharmony_ci	hive->reset_domain = NULL;
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	mutex_destroy(&hive->hive_lock);
30462306a36Sopenharmony_ci	kfree(hive);
30562306a36Sopenharmony_ci}
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_cistatic const struct sysfs_ops amdgpu_xgmi_hive_ops = {
30862306a36Sopenharmony_ci	.show = amdgpu_xgmi_show_attrs,
30962306a36Sopenharmony_ci};
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_cistatic const struct kobj_type amdgpu_xgmi_hive_type = {
31262306a36Sopenharmony_ci	.release = amdgpu_xgmi_hive_release,
31362306a36Sopenharmony_ci	.sysfs_ops = &amdgpu_xgmi_hive_ops,
31462306a36Sopenharmony_ci	.default_groups = amdgpu_xgmi_hive_groups,
31562306a36Sopenharmony_ci};
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
31862306a36Sopenharmony_ci				     struct device_attribute *attr,
31962306a36Sopenharmony_ci				     char *buf)
32062306a36Sopenharmony_ci{
32162306a36Sopenharmony_ci	struct drm_device *ddev = dev_get_drvdata(dev);
32262306a36Sopenharmony_ci	struct amdgpu_device *adev = drm_to_adev(ddev);
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci	return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_ci}
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_num_hops(struct device *dev,
32962306a36Sopenharmony_ci					struct device_attribute *attr,
33062306a36Sopenharmony_ci					char *buf)
33162306a36Sopenharmony_ci{
33262306a36Sopenharmony_ci	struct drm_device *ddev = dev_get_drvdata(dev);
33362306a36Sopenharmony_ci	struct amdgpu_device *adev = drm_to_adev(ddev);
33462306a36Sopenharmony_ci	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
33562306a36Sopenharmony_ci	int i;
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	for (i = 0; i < top->num_nodes; i++)
33862306a36Sopenharmony_ci		sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_hops);
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci	return sysfs_emit(buf, "%s\n", buf);
34162306a36Sopenharmony_ci}
34262306a36Sopenharmony_ci
34362306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_num_links(struct device *dev,
34462306a36Sopenharmony_ci					struct device_attribute *attr,
34562306a36Sopenharmony_ci					char *buf)
34662306a36Sopenharmony_ci{
34762306a36Sopenharmony_ci	struct drm_device *ddev = dev_get_drvdata(dev);
34862306a36Sopenharmony_ci	struct amdgpu_device *adev = drm_to_adev(ddev);
34962306a36Sopenharmony_ci	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
35062306a36Sopenharmony_ci	int i;
35162306a36Sopenharmony_ci
35262306a36Sopenharmony_ci	for (i = 0; i < top->num_nodes; i++)
35362306a36Sopenharmony_ci		sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_links);
35462306a36Sopenharmony_ci
35562306a36Sopenharmony_ci	return sysfs_emit(buf, "%s\n", buf);
35662306a36Sopenharmony_ci}
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci#define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
35962306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_error(struct device *dev,
36062306a36Sopenharmony_ci				      struct device_attribute *attr,
36162306a36Sopenharmony_ci				      char *buf)
36262306a36Sopenharmony_ci{
36362306a36Sopenharmony_ci	struct drm_device *ddev = dev_get_drvdata(dev);
36462306a36Sopenharmony_ci	struct amdgpu_device *adev = drm_to_adev(ddev);
36562306a36Sopenharmony_ci	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
36662306a36Sopenharmony_ci	uint64_t fica_out;
36762306a36Sopenharmony_ci	unsigned int error_count = 0;
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
37062306a36Sopenharmony_ci	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	if ((!adev->df.funcs) ||
37362306a36Sopenharmony_ci	    (!adev->df.funcs->get_fica) ||
37462306a36Sopenharmony_ci	    (!adev->df.funcs->set_fica))
37562306a36Sopenharmony_ci		return -EINVAL;
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
37862306a36Sopenharmony_ci	if (fica_out != 0x1f)
37962306a36Sopenharmony_ci		pr_err("xGMI error counters not enabled!\n");
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
38262306a36Sopenharmony_ci
38362306a36Sopenharmony_ci	if ((fica_out & 0xffff) == 2)
38462306a36Sopenharmony_ci		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
38762306a36Sopenharmony_ci
38862306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", error_count);
38962306a36Sopenharmony_ci}
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci
39262306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
39362306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
39462306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL);
39562306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL);
39662306a36Sopenharmony_ci
39762306a36Sopenharmony_cistatic int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
39862306a36Sopenharmony_ci					 struct amdgpu_hive_info *hive)
39962306a36Sopenharmony_ci{
40062306a36Sopenharmony_ci	int ret = 0;
40162306a36Sopenharmony_ci	char node[10] = { 0 };
40262306a36Sopenharmony_ci
40362306a36Sopenharmony_ci	/* Create xgmi device id file */
40462306a36Sopenharmony_ci	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
40562306a36Sopenharmony_ci	if (ret) {
40662306a36Sopenharmony_ci		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
40762306a36Sopenharmony_ci		return ret;
40862306a36Sopenharmony_ci	}
40962306a36Sopenharmony_ci
41062306a36Sopenharmony_ci	/* Create xgmi error file */
41162306a36Sopenharmony_ci	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
41262306a36Sopenharmony_ci	if (ret)
41362306a36Sopenharmony_ci		pr_err("failed to create xgmi_error\n");
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci	/* Create xgmi num hops file */
41662306a36Sopenharmony_ci	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops);
41762306a36Sopenharmony_ci	if (ret)
41862306a36Sopenharmony_ci		pr_err("failed to create xgmi_num_hops\n");
41962306a36Sopenharmony_ci
42062306a36Sopenharmony_ci	/* Create xgmi num links file */
42162306a36Sopenharmony_ci	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links);
42262306a36Sopenharmony_ci	if (ret)
42362306a36Sopenharmony_ci		pr_err("failed to create xgmi_num_links\n");
42462306a36Sopenharmony_ci
42562306a36Sopenharmony_ci	/* Create sysfs link to hive info folder on the first device */
42662306a36Sopenharmony_ci	if (hive->kobj.parent != (&adev->dev->kobj)) {
42762306a36Sopenharmony_ci		ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
42862306a36Sopenharmony_ci					"xgmi_hive_info");
42962306a36Sopenharmony_ci		if (ret) {
43062306a36Sopenharmony_ci			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
43162306a36Sopenharmony_ci			goto remove_file;
43262306a36Sopenharmony_ci		}
43362306a36Sopenharmony_ci	}
43462306a36Sopenharmony_ci
43562306a36Sopenharmony_ci	sprintf(node, "node%d", atomic_read(&hive->number_devices));
43662306a36Sopenharmony_ci	/* Create sysfs link form the hive folder to yourself */
43762306a36Sopenharmony_ci	ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
43862306a36Sopenharmony_ci	if (ret) {
43962306a36Sopenharmony_ci		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
44062306a36Sopenharmony_ci		goto remove_link;
44162306a36Sopenharmony_ci	}
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	goto success;
44462306a36Sopenharmony_ci
44562306a36Sopenharmony_ci
44662306a36Sopenharmony_ciremove_link:
44762306a36Sopenharmony_ci	sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);
44862306a36Sopenharmony_ci
44962306a36Sopenharmony_ciremove_file:
45062306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
45162306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_error);
45262306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
45362306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_cisuccess:
45662306a36Sopenharmony_ci	return ret;
45762306a36Sopenharmony_ci}
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_cistatic void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
46062306a36Sopenharmony_ci					  struct amdgpu_hive_info *hive)
46162306a36Sopenharmony_ci{
46262306a36Sopenharmony_ci	char node[10];
46362306a36Sopenharmony_ci	memset(node, 0, sizeof(node));
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
46662306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_error);
46762306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
46862306a36Sopenharmony_ci	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
46962306a36Sopenharmony_ci
47062306a36Sopenharmony_ci	if (hive->kobj.parent != (&adev->dev->kobj))
47162306a36Sopenharmony_ci		sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info");
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci	sprintf(node, "node%d", atomic_read(&hive->number_devices));
47462306a36Sopenharmony_ci	sysfs_remove_link(&hive->kobj, node);
47562306a36Sopenharmony_ci
47662306a36Sopenharmony_ci}
47762306a36Sopenharmony_ci
47862306a36Sopenharmony_ci
47962306a36Sopenharmony_ci
48062306a36Sopenharmony_cistruct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
48162306a36Sopenharmony_ci{
48262306a36Sopenharmony_ci	struct amdgpu_hive_info *hive = NULL;
48362306a36Sopenharmony_ci	int ret;
48462306a36Sopenharmony_ci
48562306a36Sopenharmony_ci	if (!adev->gmc.xgmi.hive_id)
48662306a36Sopenharmony_ci		return NULL;
48762306a36Sopenharmony_ci
48862306a36Sopenharmony_ci	if (adev->hive) {
48962306a36Sopenharmony_ci		kobject_get(&adev->hive->kobj);
49062306a36Sopenharmony_ci		return adev->hive;
49162306a36Sopenharmony_ci	}
49262306a36Sopenharmony_ci
49362306a36Sopenharmony_ci	mutex_lock(&xgmi_mutex);
49462306a36Sopenharmony_ci
49562306a36Sopenharmony_ci	list_for_each_entry(hive, &xgmi_hive_list, node)  {
49662306a36Sopenharmony_ci		if (hive->hive_id == adev->gmc.xgmi.hive_id)
49762306a36Sopenharmony_ci			goto pro_end;
49862306a36Sopenharmony_ci	}
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	hive = kzalloc(sizeof(*hive), GFP_KERNEL);
50162306a36Sopenharmony_ci	if (!hive) {
50262306a36Sopenharmony_ci		dev_err(adev->dev, "XGMI: allocation failed\n");
50362306a36Sopenharmony_ci		ret = -ENOMEM;
50462306a36Sopenharmony_ci		hive = NULL;
50562306a36Sopenharmony_ci		goto pro_end;
50662306a36Sopenharmony_ci	}
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	/* initialize new hive if not exist */
50962306a36Sopenharmony_ci	ret = kobject_init_and_add(&hive->kobj,
51062306a36Sopenharmony_ci			&amdgpu_xgmi_hive_type,
51162306a36Sopenharmony_ci			&adev->dev->kobj,
51262306a36Sopenharmony_ci			"%s", "xgmi_hive_info");
51362306a36Sopenharmony_ci	if (ret) {
51462306a36Sopenharmony_ci		dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
51562306a36Sopenharmony_ci		kobject_put(&hive->kobj);
51662306a36Sopenharmony_ci		hive = NULL;
51762306a36Sopenharmony_ci		goto pro_end;
51862306a36Sopenharmony_ci	}
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci	/**
52162306a36Sopenharmony_ci	 * Only init hive->reset_domain for none SRIOV configuration. For SRIOV,
52262306a36Sopenharmony_ci	 * Host driver decide how to reset the GPU either through FLR or chain reset.
52362306a36Sopenharmony_ci	 * Guest side will get individual notifications from the host for the FLR
52462306a36Sopenharmony_ci	 * if necessary.
52562306a36Sopenharmony_ci	 */
52662306a36Sopenharmony_ci	if (!amdgpu_sriov_vf(adev)) {
52762306a36Sopenharmony_ci	/**
52862306a36Sopenharmony_ci	 * Avoid recreating reset domain when hive is reconstructed for the case
52962306a36Sopenharmony_ci	 * of reset the devices in the XGMI hive during probe for passthrough GPU
53062306a36Sopenharmony_ci	 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
53162306a36Sopenharmony_ci	 */
53262306a36Sopenharmony_ci		if (adev->reset_domain->type != XGMI_HIVE) {
53362306a36Sopenharmony_ci			hive->reset_domain =
53462306a36Sopenharmony_ci				amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
53562306a36Sopenharmony_ci			if (!hive->reset_domain) {
53662306a36Sopenharmony_ci				dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
53762306a36Sopenharmony_ci				ret = -ENOMEM;
53862306a36Sopenharmony_ci				kobject_put(&hive->kobj);
53962306a36Sopenharmony_ci				hive = NULL;
54062306a36Sopenharmony_ci				goto pro_end;
54162306a36Sopenharmony_ci			}
54262306a36Sopenharmony_ci		} else {
54362306a36Sopenharmony_ci			amdgpu_reset_get_reset_domain(adev->reset_domain);
54462306a36Sopenharmony_ci			hive->reset_domain = adev->reset_domain;
54562306a36Sopenharmony_ci		}
54662306a36Sopenharmony_ci	}
54762306a36Sopenharmony_ci
54862306a36Sopenharmony_ci	hive->hive_id = adev->gmc.xgmi.hive_id;
54962306a36Sopenharmony_ci	INIT_LIST_HEAD(&hive->device_list);
55062306a36Sopenharmony_ci	INIT_LIST_HEAD(&hive->node);
55162306a36Sopenharmony_ci	mutex_init(&hive->hive_lock);
55262306a36Sopenharmony_ci	atomic_set(&hive->number_devices, 0);
55362306a36Sopenharmony_ci	task_barrier_init(&hive->tb);
55462306a36Sopenharmony_ci	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
55562306a36Sopenharmony_ci	hive->hi_req_gpu = NULL;
55662306a36Sopenharmony_ci
55762306a36Sopenharmony_ci	/*
55862306a36Sopenharmony_ci	 * hive pstate on boot is high in vega20 so we have to go to low
55962306a36Sopenharmony_ci	 * pstate on after boot.
56062306a36Sopenharmony_ci	 */
56162306a36Sopenharmony_ci	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
56262306a36Sopenharmony_ci	list_add_tail(&hive->node, &xgmi_hive_list);
56362306a36Sopenharmony_ci
56462306a36Sopenharmony_cipro_end:
56562306a36Sopenharmony_ci	if (hive)
56662306a36Sopenharmony_ci		kobject_get(&hive->kobj);
56762306a36Sopenharmony_ci	mutex_unlock(&xgmi_mutex);
56862306a36Sopenharmony_ci	return hive;
56962306a36Sopenharmony_ci}
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_civoid amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
57262306a36Sopenharmony_ci{
57362306a36Sopenharmony_ci	if (hive)
57462306a36Sopenharmony_ci		kobject_put(&hive->kobj);
57562306a36Sopenharmony_ci}
57662306a36Sopenharmony_ci
57762306a36Sopenharmony_ciint amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
57862306a36Sopenharmony_ci{
57962306a36Sopenharmony_ci	int ret = 0;
58062306a36Sopenharmony_ci	struct amdgpu_hive_info *hive;
58162306a36Sopenharmony_ci	struct amdgpu_device *request_adev;
58262306a36Sopenharmony_ci	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
58362306a36Sopenharmony_ci	bool init_low;
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	hive = amdgpu_get_xgmi_hive(adev);
58662306a36Sopenharmony_ci	if (!hive)
58762306a36Sopenharmony_ci		return 0;
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_ci	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
59062306a36Sopenharmony_ci	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
59162306a36Sopenharmony_ci	amdgpu_put_xgmi_hive(hive);
59262306a36Sopenharmony_ci	/* fw bug so temporarily disable pstate switching */
59362306a36Sopenharmony_ci	return 0;
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci	if (!hive || adev->asic_type != CHIP_VEGA20)
59662306a36Sopenharmony_ci		return 0;
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	mutex_lock(&hive->hive_lock);
59962306a36Sopenharmony_ci
60062306a36Sopenharmony_ci	if (is_hi_req)
60162306a36Sopenharmony_ci		hive->hi_req_count++;
60262306a36Sopenharmony_ci	else
60362306a36Sopenharmony_ci		hive->hi_req_count--;
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_ci	/*
60662306a36Sopenharmony_ci	 * Vega20 only needs single peer to request pstate high for the hive to
60762306a36Sopenharmony_ci	 * go high but all peers must request pstate low for the hive to go low
60862306a36Sopenharmony_ci	 */
60962306a36Sopenharmony_ci	if (hive->pstate == pstate ||
61062306a36Sopenharmony_ci			(!is_hi_req && hive->hi_req_count && !init_low))
61162306a36Sopenharmony_ci		goto out;
61262306a36Sopenharmony_ci
61362306a36Sopenharmony_ci	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
61662306a36Sopenharmony_ci	if (ret) {
61762306a36Sopenharmony_ci		dev_err(request_adev->dev,
61862306a36Sopenharmony_ci			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
61962306a36Sopenharmony_ci			request_adev->gmc.xgmi.node_id,
62062306a36Sopenharmony_ci			request_adev->gmc.xgmi.hive_id, ret);
62162306a36Sopenharmony_ci		goto out;
62262306a36Sopenharmony_ci	}
62362306a36Sopenharmony_ci
62462306a36Sopenharmony_ci	if (init_low)
62562306a36Sopenharmony_ci		hive->pstate = hive->hi_req_count ?
62662306a36Sopenharmony_ci					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
62762306a36Sopenharmony_ci	else {
62862306a36Sopenharmony_ci		hive->pstate = pstate;
62962306a36Sopenharmony_ci		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
63062306a36Sopenharmony_ci							adev : NULL;
63162306a36Sopenharmony_ci	}
63262306a36Sopenharmony_ciout:
63362306a36Sopenharmony_ci	mutex_unlock(&hive->hive_lock);
63462306a36Sopenharmony_ci	return ret;
63562306a36Sopenharmony_ci}
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ciint amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
63862306a36Sopenharmony_ci{
63962306a36Sopenharmony_ci	int ret;
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci	if (amdgpu_sriov_vf(adev))
64262306a36Sopenharmony_ci		return 0;
64362306a36Sopenharmony_ci
64462306a36Sopenharmony_ci	/* Each psp need to set the latest topology */
64562306a36Sopenharmony_ci	ret = psp_xgmi_set_topology_info(&adev->psp,
64662306a36Sopenharmony_ci					 atomic_read(&hive->number_devices),
64762306a36Sopenharmony_ci					 &adev->psp.xgmi_context.top_info);
64862306a36Sopenharmony_ci	if (ret)
64962306a36Sopenharmony_ci		dev_err(adev->dev,
65062306a36Sopenharmony_ci			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
65162306a36Sopenharmony_ci			adev->gmc.xgmi.node_id,
65262306a36Sopenharmony_ci			adev->gmc.xgmi.hive_id, ret);
65362306a36Sopenharmony_ci
65462306a36Sopenharmony_ci	return ret;
65562306a36Sopenharmony_ci}
65662306a36Sopenharmony_ci
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci/*
65962306a36Sopenharmony_ci * NOTE psp_xgmi_node_info.num_hops layout is as follows:
66062306a36Sopenharmony_ci * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
66162306a36Sopenharmony_ci * num_hops[5:3] = reserved
66262306a36Sopenharmony_ci * num_hops[2:0] = number of hops
66362306a36Sopenharmony_ci */
66462306a36Sopenharmony_ciint amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
66562306a36Sopenharmony_ci		struct amdgpu_device *peer_adev)
66662306a36Sopenharmony_ci{
66762306a36Sopenharmony_ci	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
66862306a36Sopenharmony_ci	uint8_t num_hops_mask = 0x7;
66962306a36Sopenharmony_ci	int i;
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	for (i = 0 ; i < top->num_nodes; ++i)
67262306a36Sopenharmony_ci		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
67362306a36Sopenharmony_ci			return top->nodes[i].num_hops & num_hops_mask;
67462306a36Sopenharmony_ci	return	-EINVAL;
67562306a36Sopenharmony_ci}
67662306a36Sopenharmony_ci
67762306a36Sopenharmony_ciint amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
67862306a36Sopenharmony_ci		struct amdgpu_device *peer_adev)
67962306a36Sopenharmony_ci{
68062306a36Sopenharmony_ci	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
68162306a36Sopenharmony_ci	int i;
68262306a36Sopenharmony_ci
68362306a36Sopenharmony_ci	for (i = 0 ; i < top->num_nodes; ++i)
68462306a36Sopenharmony_ci		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
68562306a36Sopenharmony_ci			return top->nodes[i].num_links;
68662306a36Sopenharmony_ci	return	-EINVAL;
68762306a36Sopenharmony_ci}
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci/*
69062306a36Sopenharmony_ci * Devices that support extended data require the entire hive to initialize with
69162306a36Sopenharmony_ci * the shared memory buffer flag set.
69262306a36Sopenharmony_ci *
69362306a36Sopenharmony_ci * Hive locks and conditions apply - see amdgpu_xgmi_add_device
69462306a36Sopenharmony_ci */
69562306a36Sopenharmony_cistatic int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
69662306a36Sopenharmony_ci							bool set_extended_data)
69762306a36Sopenharmony_ci{
69862306a36Sopenharmony_ci	struct amdgpu_device *tmp_adev;
69962306a36Sopenharmony_ci	int ret;
70062306a36Sopenharmony_ci
70162306a36Sopenharmony_ci	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
70262306a36Sopenharmony_ci		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
70362306a36Sopenharmony_ci		if (ret) {
70462306a36Sopenharmony_ci			dev_err(tmp_adev->dev,
70562306a36Sopenharmony_ci				"XGMI: Failed to initialize xgmi session for data partition %i\n",
70662306a36Sopenharmony_ci				set_extended_data);
70762306a36Sopenharmony_ci			return ret;
70862306a36Sopenharmony_ci		}
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_ci	}
71162306a36Sopenharmony_ci
71262306a36Sopenharmony_ci	return 0;
71362306a36Sopenharmony_ci}
71462306a36Sopenharmony_ci
71562306a36Sopenharmony_ciint amdgpu_xgmi_add_device(struct amdgpu_device *adev)
71662306a36Sopenharmony_ci{
71762306a36Sopenharmony_ci	struct psp_xgmi_topology_info *top_info;
71862306a36Sopenharmony_ci	struct amdgpu_hive_info *hive;
71962306a36Sopenharmony_ci	struct amdgpu_xgmi	*entry;
72062306a36Sopenharmony_ci	struct amdgpu_device *tmp_adev = NULL;
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	int count = 0, ret = 0;
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci	if (!adev->gmc.xgmi.supported)
72562306a36Sopenharmony_ci		return 0;
72662306a36Sopenharmony_ci
72762306a36Sopenharmony_ci	if (!adev->gmc.xgmi.pending_reset &&
72862306a36Sopenharmony_ci	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
72962306a36Sopenharmony_ci		ret = psp_xgmi_initialize(&adev->psp, false, true);
73062306a36Sopenharmony_ci		if (ret) {
73162306a36Sopenharmony_ci			dev_err(adev->dev,
73262306a36Sopenharmony_ci				"XGMI: Failed to initialize xgmi session\n");
73362306a36Sopenharmony_ci			return ret;
73462306a36Sopenharmony_ci		}
73562306a36Sopenharmony_ci
73662306a36Sopenharmony_ci		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
73762306a36Sopenharmony_ci		if (ret) {
73862306a36Sopenharmony_ci			dev_err(adev->dev,
73962306a36Sopenharmony_ci				"XGMI: Failed to get hive id\n");
74062306a36Sopenharmony_ci			return ret;
74162306a36Sopenharmony_ci		}
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
74462306a36Sopenharmony_ci		if (ret) {
74562306a36Sopenharmony_ci			dev_err(adev->dev,
74662306a36Sopenharmony_ci				"XGMI: Failed to get node id\n");
74762306a36Sopenharmony_ci			return ret;
74862306a36Sopenharmony_ci		}
74962306a36Sopenharmony_ci	} else {
75062306a36Sopenharmony_ci		adev->gmc.xgmi.hive_id = 16;
75162306a36Sopenharmony_ci		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
75262306a36Sopenharmony_ci	}
75362306a36Sopenharmony_ci
75462306a36Sopenharmony_ci	hive = amdgpu_get_xgmi_hive(adev);
75562306a36Sopenharmony_ci	if (!hive) {
75662306a36Sopenharmony_ci		ret = -EINVAL;
75762306a36Sopenharmony_ci		dev_err(adev->dev,
75862306a36Sopenharmony_ci			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
75962306a36Sopenharmony_ci			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
76062306a36Sopenharmony_ci		goto exit;
76162306a36Sopenharmony_ci	}
76262306a36Sopenharmony_ci	mutex_lock(&hive->hive_lock);
76362306a36Sopenharmony_ci
76462306a36Sopenharmony_ci	top_info = &adev->psp.xgmi_context.top_info;
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
76762306a36Sopenharmony_ci	list_for_each_entry(entry, &hive->device_list, head)
76862306a36Sopenharmony_ci		top_info->nodes[count++].node_id = entry->node_id;
76962306a36Sopenharmony_ci	top_info->num_nodes = count;
77062306a36Sopenharmony_ci	atomic_set(&hive->number_devices, count);
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	task_barrier_add_task(&hive->tb);
77362306a36Sopenharmony_ci
77462306a36Sopenharmony_ci	if (!adev->gmc.xgmi.pending_reset &&
77562306a36Sopenharmony_ci	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
77662306a36Sopenharmony_ci		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
77762306a36Sopenharmony_ci			/* update node list for other device in the hive */
77862306a36Sopenharmony_ci			if (tmp_adev != adev) {
77962306a36Sopenharmony_ci				top_info = &tmp_adev->psp.xgmi_context.top_info;
78062306a36Sopenharmony_ci				top_info->nodes[count - 1].node_id =
78162306a36Sopenharmony_ci					adev->gmc.xgmi.node_id;
78262306a36Sopenharmony_ci				top_info->num_nodes = count;
78362306a36Sopenharmony_ci			}
78462306a36Sopenharmony_ci			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
78562306a36Sopenharmony_ci			if (ret)
78662306a36Sopenharmony_ci				goto exit_unlock;
78762306a36Sopenharmony_ci		}
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci		/* get latest topology info for each device from psp */
79062306a36Sopenharmony_ci		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
79162306a36Sopenharmony_ci			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
79262306a36Sopenharmony_ci					&tmp_adev->psp.xgmi_context.top_info, false);
79362306a36Sopenharmony_ci			if (ret) {
79462306a36Sopenharmony_ci				dev_err(tmp_adev->dev,
79562306a36Sopenharmony_ci					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
79662306a36Sopenharmony_ci					tmp_adev->gmc.xgmi.node_id,
79762306a36Sopenharmony_ci					tmp_adev->gmc.xgmi.hive_id, ret);
79862306a36Sopenharmony_ci				/* To do : continue with some node failed or disable the whole hive */
79962306a36Sopenharmony_ci				goto exit_unlock;
80062306a36Sopenharmony_ci			}
80162306a36Sopenharmony_ci		}
80262306a36Sopenharmony_ci
80362306a36Sopenharmony_ci		/* get topology again for hives that support extended data */
80462306a36Sopenharmony_ci		if (adev->psp.xgmi_context.supports_extended_data) {
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci			/* initialize the hive to get extended data.  */
80762306a36Sopenharmony_ci			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
80862306a36Sopenharmony_ci			if (ret)
80962306a36Sopenharmony_ci				goto exit_unlock;
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_ci			/* get the extended data. */
81262306a36Sopenharmony_ci			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
81362306a36Sopenharmony_ci				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
81462306a36Sopenharmony_ci						&tmp_adev->psp.xgmi_context.top_info, true);
81562306a36Sopenharmony_ci				if (ret) {
81662306a36Sopenharmony_ci					dev_err(tmp_adev->dev,
81762306a36Sopenharmony_ci						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
81862306a36Sopenharmony_ci						tmp_adev->gmc.xgmi.node_id,
81962306a36Sopenharmony_ci						tmp_adev->gmc.xgmi.hive_id, ret);
82062306a36Sopenharmony_ci					goto exit_unlock;
82162306a36Sopenharmony_ci				}
82262306a36Sopenharmony_ci			}
82362306a36Sopenharmony_ci
82462306a36Sopenharmony_ci			/* initialize the hive to get non-extended data for the next round. */
82562306a36Sopenharmony_ci			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
82662306a36Sopenharmony_ci			if (ret)
82762306a36Sopenharmony_ci				goto exit_unlock;
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci		}
83062306a36Sopenharmony_ci	}
83162306a36Sopenharmony_ci
83262306a36Sopenharmony_ci	if (!ret && !adev->gmc.xgmi.pending_reset)
83362306a36Sopenharmony_ci		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
83462306a36Sopenharmony_ci
83562306a36Sopenharmony_ciexit_unlock:
83662306a36Sopenharmony_ci	mutex_unlock(&hive->hive_lock);
83762306a36Sopenharmony_ciexit:
83862306a36Sopenharmony_ci	if (!ret) {
83962306a36Sopenharmony_ci		adev->hive = hive;
84062306a36Sopenharmony_ci		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
84162306a36Sopenharmony_ci			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
84262306a36Sopenharmony_ci	} else {
84362306a36Sopenharmony_ci		amdgpu_put_xgmi_hive(hive);
84462306a36Sopenharmony_ci		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
84562306a36Sopenharmony_ci			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
84662306a36Sopenharmony_ci			ret);
84762306a36Sopenharmony_ci	}
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ci	return ret;
85062306a36Sopenharmony_ci}
85162306a36Sopenharmony_ci
85262306a36Sopenharmony_ciint amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
85362306a36Sopenharmony_ci{
85462306a36Sopenharmony_ci	struct amdgpu_hive_info *hive = adev->hive;
85562306a36Sopenharmony_ci
85662306a36Sopenharmony_ci	if (!adev->gmc.xgmi.supported)
85762306a36Sopenharmony_ci		return -EINVAL;
85862306a36Sopenharmony_ci
85962306a36Sopenharmony_ci	if (!hive)
86062306a36Sopenharmony_ci		return -EINVAL;
86162306a36Sopenharmony_ci
86262306a36Sopenharmony_ci	mutex_lock(&hive->hive_lock);
86362306a36Sopenharmony_ci	task_barrier_rem_task(&hive->tb);
86462306a36Sopenharmony_ci	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
86562306a36Sopenharmony_ci	if (hive->hi_req_gpu == adev)
86662306a36Sopenharmony_ci		hive->hi_req_gpu = NULL;
86762306a36Sopenharmony_ci	list_del(&adev->gmc.xgmi.head);
86862306a36Sopenharmony_ci	mutex_unlock(&hive->hive_lock);
86962306a36Sopenharmony_ci
87062306a36Sopenharmony_ci	amdgpu_put_xgmi_hive(hive);
87162306a36Sopenharmony_ci	adev->hive = NULL;
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci	if (atomic_dec_return(&hive->number_devices) == 0) {
87462306a36Sopenharmony_ci		/* Remove the hive from global hive list */
87562306a36Sopenharmony_ci		mutex_lock(&xgmi_mutex);
87662306a36Sopenharmony_ci		list_del(&hive->node);
87762306a36Sopenharmony_ci		mutex_unlock(&xgmi_mutex);
87862306a36Sopenharmony_ci
87962306a36Sopenharmony_ci		amdgpu_put_xgmi_hive(hive);
88062306a36Sopenharmony_ci	}
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci	return 0;
88362306a36Sopenharmony_ci}
88462306a36Sopenharmony_ci
88562306a36Sopenharmony_cistatic int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
88662306a36Sopenharmony_ci{
88762306a36Sopenharmony_ci	if (!adev->gmc.xgmi.supported ||
88862306a36Sopenharmony_ci	    adev->gmc.xgmi.num_physical_nodes == 0)
88962306a36Sopenharmony_ci		return 0;
89062306a36Sopenharmony_ci
89162306a36Sopenharmony_ci	adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
89262306a36Sopenharmony_ci
89362306a36Sopenharmony_ci	return amdgpu_ras_block_late_init(adev, ras_block);
89462306a36Sopenharmony_ci}
89562306a36Sopenharmony_ci
89662306a36Sopenharmony_ciuint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
89762306a36Sopenharmony_ci					   uint64_t addr)
89862306a36Sopenharmony_ci{
89962306a36Sopenharmony_ci	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
90062306a36Sopenharmony_ci	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
90162306a36Sopenharmony_ci}
90262306a36Sopenharmony_ci
90362306a36Sopenharmony_cistatic void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
90462306a36Sopenharmony_ci{
90562306a36Sopenharmony_ci	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
90662306a36Sopenharmony_ci	WREG32_PCIE(pcs_status_reg, 0);
90762306a36Sopenharmony_ci}
90862306a36Sopenharmony_ci
90962306a36Sopenharmony_cistatic void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
91062306a36Sopenharmony_ci{
91162306a36Sopenharmony_ci	uint32_t i;
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_ci	switch (adev->asic_type) {
91462306a36Sopenharmony_ci	case CHIP_ARCTURUS:
91562306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
91662306a36Sopenharmony_ci			pcs_clear_status(adev,
91762306a36Sopenharmony_ci					 xgmi_pcs_err_status_reg_arct[i]);
91862306a36Sopenharmony_ci		break;
91962306a36Sopenharmony_ci	case CHIP_VEGA20:
92062306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
92162306a36Sopenharmony_ci			pcs_clear_status(adev,
92262306a36Sopenharmony_ci					 xgmi_pcs_err_status_reg_vg20[i]);
92362306a36Sopenharmony_ci		break;
92462306a36Sopenharmony_ci	case CHIP_ALDEBARAN:
92562306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
92662306a36Sopenharmony_ci			pcs_clear_status(adev,
92762306a36Sopenharmony_ci					 xgmi3x16_pcs_err_status_reg_aldebaran[i]);
92862306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
92962306a36Sopenharmony_ci			pcs_clear_status(adev,
93062306a36Sopenharmony_ci					 walf_pcs_err_status_reg_aldebaran[i]);
93162306a36Sopenharmony_ci		break;
93262306a36Sopenharmony_ci	default:
93362306a36Sopenharmony_ci		break;
93462306a36Sopenharmony_ci	}
93562306a36Sopenharmony_ci}
93662306a36Sopenharmony_ci
93762306a36Sopenharmony_cistatic int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
93862306a36Sopenharmony_ci					      uint32_t value,
93962306a36Sopenharmony_ci						  uint32_t mask_value,
94062306a36Sopenharmony_ci					      uint32_t *ue_count,
94162306a36Sopenharmony_ci					      uint32_t *ce_count,
94262306a36Sopenharmony_ci					      bool is_xgmi_pcs,
94362306a36Sopenharmony_ci						  bool check_mask)
94462306a36Sopenharmony_ci{
94562306a36Sopenharmony_ci	int i;
94662306a36Sopenharmony_ci	int ue_cnt = 0;
94762306a36Sopenharmony_ci	const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL;
94862306a36Sopenharmony_ci	uint32_t field_array_size = 0;
94962306a36Sopenharmony_ci
95062306a36Sopenharmony_ci	if (is_xgmi_pcs) {
95162306a36Sopenharmony_ci		if (adev->ip_versions[XGMI_HWIP][0] == IP_VERSION(6, 1, 0)) {
95262306a36Sopenharmony_ci			pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
95362306a36Sopenharmony_ci			field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
95462306a36Sopenharmony_ci		} else {
95562306a36Sopenharmony_ci			pcs_ras_fields = &xgmi_pcs_ras_fields[0];
95662306a36Sopenharmony_ci			field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields);
95762306a36Sopenharmony_ci		}
95862306a36Sopenharmony_ci	} else {
95962306a36Sopenharmony_ci		pcs_ras_fields = &wafl_pcs_ras_fields[0];
96062306a36Sopenharmony_ci		field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields);
96162306a36Sopenharmony_ci	}
96262306a36Sopenharmony_ci
96362306a36Sopenharmony_ci	if (check_mask)
96462306a36Sopenharmony_ci		value = value & ~mask_value;
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci	/* query xgmi/walf pcs error status,
96762306a36Sopenharmony_ci	 * only ue is supported */
96862306a36Sopenharmony_ci	for (i = 0; value && i < field_array_size; i++) {
96962306a36Sopenharmony_ci		ue_cnt = (value &
97062306a36Sopenharmony_ci				pcs_ras_fields[i].pcs_err_mask) >>
97162306a36Sopenharmony_ci				pcs_ras_fields[i].pcs_err_shift;
97262306a36Sopenharmony_ci		if (ue_cnt) {
97362306a36Sopenharmony_ci			dev_info(adev->dev, "%s detected\n",
97462306a36Sopenharmony_ci				 pcs_ras_fields[i].err_name);
97562306a36Sopenharmony_ci			*ue_count += ue_cnt;
97662306a36Sopenharmony_ci		}
97762306a36Sopenharmony_ci
97862306a36Sopenharmony_ci		/* reset bit value if the bit is checked */
97962306a36Sopenharmony_ci		value &= ~(pcs_ras_fields[i].pcs_err_mask);
98062306a36Sopenharmony_ci	}
98162306a36Sopenharmony_ci
98262306a36Sopenharmony_ci	return 0;
98362306a36Sopenharmony_ci}
98462306a36Sopenharmony_ci
98562306a36Sopenharmony_cistatic void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
98662306a36Sopenharmony_ci					     void *ras_error_status)
98762306a36Sopenharmony_ci{
98862306a36Sopenharmony_ci	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
98962306a36Sopenharmony_ci	int i;
99062306a36Sopenharmony_ci	uint32_t data, mask_data = 0;
99162306a36Sopenharmony_ci	uint32_t ue_cnt = 0, ce_cnt = 0;
99262306a36Sopenharmony_ci
99362306a36Sopenharmony_ci	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
99462306a36Sopenharmony_ci		return ;
99562306a36Sopenharmony_ci
99662306a36Sopenharmony_ci	err_data->ue_count = 0;
99762306a36Sopenharmony_ci	err_data->ce_count = 0;
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_ci	switch (adev->asic_type) {
100062306a36Sopenharmony_ci	case CHIP_ARCTURUS:
100162306a36Sopenharmony_ci		/* check xgmi pcs error */
100262306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
100362306a36Sopenharmony_ci			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
100462306a36Sopenharmony_ci			if (data)
100562306a36Sopenharmony_ci				amdgpu_xgmi_query_pcs_error_status(adev, data,
100662306a36Sopenharmony_ci						mask_data, &ue_cnt, &ce_cnt, true, false);
100762306a36Sopenharmony_ci		}
100862306a36Sopenharmony_ci		/* check wafl pcs error */
100962306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
101062306a36Sopenharmony_ci			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
101162306a36Sopenharmony_ci			if (data)
101262306a36Sopenharmony_ci				amdgpu_xgmi_query_pcs_error_status(adev, data,
101362306a36Sopenharmony_ci						mask_data, &ue_cnt, &ce_cnt, false, false);
101462306a36Sopenharmony_ci		}
101562306a36Sopenharmony_ci		break;
101662306a36Sopenharmony_ci	case CHIP_VEGA20:
101762306a36Sopenharmony_ci		/* check xgmi pcs error */
101862306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
101962306a36Sopenharmony_ci			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
102062306a36Sopenharmony_ci			if (data)
102162306a36Sopenharmony_ci				amdgpu_xgmi_query_pcs_error_status(adev, data,
102262306a36Sopenharmony_ci						mask_data, &ue_cnt, &ce_cnt, true, false);
102362306a36Sopenharmony_ci		}
102462306a36Sopenharmony_ci		/* check wafl pcs error */
102562306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
102662306a36Sopenharmony_ci			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
102762306a36Sopenharmony_ci			if (data)
102862306a36Sopenharmony_ci				amdgpu_xgmi_query_pcs_error_status(adev, data,
102962306a36Sopenharmony_ci						mask_data, &ue_cnt, &ce_cnt, false, false);
103062306a36Sopenharmony_ci		}
103162306a36Sopenharmony_ci		break;
103262306a36Sopenharmony_ci	case CHIP_ALDEBARAN:
103362306a36Sopenharmony_ci		/* check xgmi3x16 pcs error */
103462306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
103562306a36Sopenharmony_ci			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
103662306a36Sopenharmony_ci			mask_data =
103762306a36Sopenharmony_ci				RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
103862306a36Sopenharmony_ci			if (data)
103962306a36Sopenharmony_ci				amdgpu_xgmi_query_pcs_error_status(adev, data,
104062306a36Sopenharmony_ci						mask_data, &ue_cnt, &ce_cnt, true, true);
104162306a36Sopenharmony_ci		}
104262306a36Sopenharmony_ci		/* check wafl pcs error */
104362306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
104462306a36Sopenharmony_ci			data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
104562306a36Sopenharmony_ci			mask_data =
104662306a36Sopenharmony_ci				RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
104762306a36Sopenharmony_ci			if (data)
104862306a36Sopenharmony_ci				amdgpu_xgmi_query_pcs_error_status(adev, data,
104962306a36Sopenharmony_ci						mask_data, &ue_cnt, &ce_cnt, false, true);
105062306a36Sopenharmony_ci		}
105162306a36Sopenharmony_ci		break;
105262306a36Sopenharmony_ci	default:
105362306a36Sopenharmony_ci		dev_warn(adev->dev, "XGMI RAS error query not supported");
105462306a36Sopenharmony_ci		break;
105562306a36Sopenharmony_ci	}
105662306a36Sopenharmony_ci
105762306a36Sopenharmony_ci	adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
105862306a36Sopenharmony_ci
105962306a36Sopenharmony_ci	err_data->ue_count += ue_cnt;
106062306a36Sopenharmony_ci	err_data->ce_count += ce_cnt;
106162306a36Sopenharmony_ci}
106262306a36Sopenharmony_ci
106362306a36Sopenharmony_ci/* Trigger XGMI/WAFL error */
106462306a36Sopenharmony_cistatic int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
106562306a36Sopenharmony_ci			void *inject_if, uint32_t instance_mask)
106662306a36Sopenharmony_ci{
106762306a36Sopenharmony_ci	int ret = 0;
106862306a36Sopenharmony_ci	struct ta_ras_trigger_error_input *block_info =
106962306a36Sopenharmony_ci				(struct ta_ras_trigger_error_input *)inject_if;
107062306a36Sopenharmony_ci
107162306a36Sopenharmony_ci	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
107262306a36Sopenharmony_ci		dev_warn(adev->dev, "Failed to disallow df cstate");
107362306a36Sopenharmony_ci
107462306a36Sopenharmony_ci	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
107562306a36Sopenharmony_ci		dev_warn(adev->dev, "Failed to disallow XGMI power down");
107662306a36Sopenharmony_ci
107762306a36Sopenharmony_ci	ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
107862306a36Sopenharmony_ci
107962306a36Sopenharmony_ci	if (amdgpu_ras_intr_triggered())
108062306a36Sopenharmony_ci		return ret;
108162306a36Sopenharmony_ci
108262306a36Sopenharmony_ci	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
108362306a36Sopenharmony_ci		dev_warn(adev->dev, "Failed to allow XGMI power down");
108462306a36Sopenharmony_ci
108562306a36Sopenharmony_ci	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
108662306a36Sopenharmony_ci		dev_warn(adev->dev, "Failed to allow df cstate");
108762306a36Sopenharmony_ci
108862306a36Sopenharmony_ci	return ret;
108962306a36Sopenharmony_ci}
109062306a36Sopenharmony_ci
109162306a36Sopenharmony_cistruct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
109262306a36Sopenharmony_ci	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
109362306a36Sopenharmony_ci	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
109462306a36Sopenharmony_ci	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
109562306a36Sopenharmony_ci};
109662306a36Sopenharmony_ci
109762306a36Sopenharmony_cistruct amdgpu_xgmi_ras xgmi_ras = {
109862306a36Sopenharmony_ci	.ras_block = {
109962306a36Sopenharmony_ci		.hw_ops = &xgmi_ras_hw_ops,
110062306a36Sopenharmony_ci		.ras_late_init = amdgpu_xgmi_ras_late_init,
110162306a36Sopenharmony_ci	},
110262306a36Sopenharmony_ci};
110362306a36Sopenharmony_ci
110462306a36Sopenharmony_ciint amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
110562306a36Sopenharmony_ci{
110662306a36Sopenharmony_ci	int err;
110762306a36Sopenharmony_ci	struct amdgpu_xgmi_ras *ras;
110862306a36Sopenharmony_ci
110962306a36Sopenharmony_ci	if (!adev->gmc.xgmi.ras)
111062306a36Sopenharmony_ci		return 0;
111162306a36Sopenharmony_ci
111262306a36Sopenharmony_ci	ras = adev->gmc.xgmi.ras;
111362306a36Sopenharmony_ci	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
111462306a36Sopenharmony_ci	if (err) {
111562306a36Sopenharmony_ci		dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n");
111662306a36Sopenharmony_ci		return err;
111762306a36Sopenharmony_ci	}
111862306a36Sopenharmony_ci
111962306a36Sopenharmony_ci	strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl");
112062306a36Sopenharmony_ci	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
112162306a36Sopenharmony_ci	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
112262306a36Sopenharmony_ci	adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;
112362306a36Sopenharmony_ci
112462306a36Sopenharmony_ci	return 0;
112562306a36Sopenharmony_ci}
1126