162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Copyright 2018 Advanced Micro Devices, Inc. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 562306a36Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 662306a36Sopenharmony_ci * to deal in the Software without restriction, including without limitation 762306a36Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 862306a36Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 962306a36Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 1062306a36Sopenharmony_ci * 1162306a36Sopenharmony_ci * The above copyright notice and this permission notice shall be included in 1262306a36Sopenharmony_ci * all copies or substantial portions of the Software. 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1562306a36Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1662306a36Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1762306a36Sopenharmony_ci * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 1862306a36Sopenharmony_ci * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 1962306a36Sopenharmony_ci * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 2062306a36Sopenharmony_ci * OTHER DEALINGS IN THE SOFTWARE. 2162306a36Sopenharmony_ci * 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci */ 2462306a36Sopenharmony_ci#include <linux/list.h> 2562306a36Sopenharmony_ci#include "amdgpu.h" 2662306a36Sopenharmony_ci#include "amdgpu_xgmi.h" 2762306a36Sopenharmony_ci#include "amdgpu_ras.h" 2862306a36Sopenharmony_ci#include "soc15.h" 2962306a36Sopenharmony_ci#include "df/df_3_6_offset.h" 3062306a36Sopenharmony_ci#include "xgmi/xgmi_4_0_0_smn.h" 3162306a36Sopenharmony_ci#include "xgmi/xgmi_4_0_0_sh_mask.h" 3262306a36Sopenharmony_ci#include "xgmi/xgmi_6_1_0_sh_mask.h" 3362306a36Sopenharmony_ci#include "wafl/wafl2_4_0_0_smn.h" 3462306a36Sopenharmony_ci#include "wafl/wafl2_4_0_0_sh_mask.h" 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci#include "amdgpu_reset.h" 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c 3962306a36Sopenharmony_ci#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218 4062306a36Sopenharmony_ci#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 4162306a36Sopenharmony_ci#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_cistatic DEFINE_MUTEX(xgmi_mutex); 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ci#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 4662306a36Sopenharmony_ci 4762306a36Sopenharmony_cistatic LIST_HEAD(xgmi_hive_list); 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistatic const int xgmi_pcs_err_status_reg_vg20[] = { 5062306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, 5162306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, 5262306a36Sopenharmony_ci}; 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_cistatic const int wafl_pcs_err_status_reg_vg20[] = { 5562306a36Sopenharmony_ci smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, 5662306a36Sopenharmony_ci smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, 5762306a36Sopenharmony_ci}; 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_cistatic const int xgmi_pcs_err_status_reg_arct[] = { 6062306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, 6162306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, 6262306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000, 6362306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000, 6462306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000, 6562306a36Sopenharmony_ci smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000, 6662306a36Sopenharmony_ci}; 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci/* same as vg20*/ 6962306a36Sopenharmony_cistatic const int wafl_pcs_err_status_reg_arct[] = { 7062306a36Sopenharmony_ci smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, 7162306a36Sopenharmony_ci smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, 7262306a36Sopenharmony_ci}; 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_cistatic const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { 7562306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS, 7662306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, 7762306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, 7862306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, 7962306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, 8062306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, 8162306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, 8262306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 8362306a36Sopenharmony_ci}; 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_cistatic const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = { 8662306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, 8762306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000, 8862306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000, 8962306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000, 9062306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000, 9162306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000, 9262306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000, 9362306a36Sopenharmony_ci smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000 9462306a36Sopenharmony_ci}; 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_cistatic const int walf_pcs_err_status_reg_aldebaran[] = { 9762306a36Sopenharmony_ci smnPCS_GOPX1_PCS_ERROR_STATUS, 9862306a36Sopenharmony_ci smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 9962306a36Sopenharmony_ci}; 10062306a36Sopenharmony_ci 10162306a36Sopenharmony_cistatic const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { 10262306a36Sopenharmony_ci smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK, 10362306a36Sopenharmony_ci smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 10462306a36Sopenharmony_ci}; 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_cistatic const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { 10762306a36Sopenharmony_ci {"XGMI PCS DataLossErr", 10862306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, 10962306a36Sopenharmony_ci {"XGMI PCS TrainingErr", 11062306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)}, 11162306a36Sopenharmony_ci {"XGMI PCS CRCErr", 11262306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)}, 11362306a36Sopenharmony_ci {"XGMI PCS BERExceededErr", 11462306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)}, 11562306a36Sopenharmony_ci {"XGMI PCS TxMetaDataErr", 11662306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)}, 11762306a36Sopenharmony_ci {"XGMI PCS ReplayBufParityErr", 11862306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)}, 11962306a36Sopenharmony_ci {"XGMI PCS DataParityErr", 12062306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)}, 12162306a36Sopenharmony_ci {"XGMI PCS ReplayFifoOverflowErr", 12262306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, 12362306a36Sopenharmony_ci {"XGMI PCS ReplayFifoUnderflowErr", 12462306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, 12562306a36Sopenharmony_ci {"XGMI PCS ElasticFifoOverflowErr", 12662306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, 12762306a36Sopenharmony_ci {"XGMI PCS DeskewErr", 12862306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)}, 12962306a36Sopenharmony_ci {"XGMI PCS DataStartupLimitErr", 13062306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)}, 13162306a36Sopenharmony_ci {"XGMI PCS FCInitTimeoutErr", 13262306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, 13362306a36Sopenharmony_ci {"XGMI PCS RecoveryTimeoutErr", 13462306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, 13562306a36Sopenharmony_ci {"XGMI PCS ReadySerialTimeoutErr", 13662306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, 13762306a36Sopenharmony_ci {"XGMI PCS ReadySerialAttemptErr", 13862306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, 13962306a36Sopenharmony_ci {"XGMI PCS RecoveryAttemptErr", 14062306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, 14162306a36Sopenharmony_ci {"XGMI PCS RecoveryRelockAttemptErr", 14262306a36Sopenharmony_ci SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, 14362306a36Sopenharmony_ci}; 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_cistatic const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { 14662306a36Sopenharmony_ci {"WAFL PCS DataLossErr", 14762306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)}, 14862306a36Sopenharmony_ci {"WAFL PCS TrainingErr", 14962306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)}, 15062306a36Sopenharmony_ci {"WAFL PCS CRCErr", 15162306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)}, 15262306a36Sopenharmony_ci {"WAFL PCS BERExceededErr", 15362306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)}, 15462306a36Sopenharmony_ci {"WAFL PCS TxMetaDataErr", 15562306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)}, 15662306a36Sopenharmony_ci {"WAFL PCS ReplayBufParityErr", 15762306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)}, 15862306a36Sopenharmony_ci {"WAFL PCS DataParityErr", 15962306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)}, 16062306a36Sopenharmony_ci {"WAFL PCS ReplayFifoOverflowErr", 16162306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, 16262306a36Sopenharmony_ci {"WAFL PCS ReplayFifoUnderflowErr", 16362306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, 16462306a36Sopenharmony_ci {"WAFL PCS ElasticFifoOverflowErr", 16562306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, 16662306a36Sopenharmony_ci {"WAFL PCS DeskewErr", 16762306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)}, 16862306a36Sopenharmony_ci {"WAFL PCS DataStartupLimitErr", 16962306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)}, 17062306a36Sopenharmony_ci {"WAFL PCS FCInitTimeoutErr", 17162306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)}, 17262306a36Sopenharmony_ci {"WAFL PCS RecoveryTimeoutErr", 17362306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, 17462306a36Sopenharmony_ci {"WAFL PCS ReadySerialTimeoutErr", 17562306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, 17662306a36Sopenharmony_ci {"WAFL PCS ReadySerialAttemptErr", 17762306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, 17862306a36Sopenharmony_ci {"WAFL PCS RecoveryAttemptErr", 17962306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)}, 18062306a36Sopenharmony_ci {"WAFL PCS RecoveryRelockAttemptErr", 18162306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, 18262306a36Sopenharmony_ci}; 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_cistatic const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = { 18562306a36Sopenharmony_ci {"XGMI3X16 PCS DataLossErr", 18662306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)}, 18762306a36Sopenharmony_ci {"XGMI3X16 PCS TrainingErr", 18862306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)}, 18962306a36Sopenharmony_ci {"XGMI3X16 PCS FlowCtrlAckErr", 19062306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)}, 19162306a36Sopenharmony_ci {"XGMI3X16 PCS RxFifoUnderflowErr", 19262306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)}, 19362306a36Sopenharmony_ci {"XGMI3X16 PCS RxFifoOverflowErr", 19462306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)}, 19562306a36Sopenharmony_ci {"XGMI3X16 PCS CRCErr", 19662306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)}, 19762306a36Sopenharmony_ci {"XGMI3X16 PCS BERExceededErr", 19862306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)}, 19962306a36Sopenharmony_ci {"XGMI3X16 PCS TxVcidDataErr", 20062306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)}, 20162306a36Sopenharmony_ci {"XGMI3X16 PCS ReplayBufParityErr", 20262306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)}, 20362306a36Sopenharmony_ci {"XGMI3X16 PCS DataParityErr", 20462306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)}, 20562306a36Sopenharmony_ci {"XGMI3X16 PCS ReplayFifoOverflowErr", 20662306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, 20762306a36Sopenharmony_ci {"XGMI3X16 PCS ReplayFifoUnderflowErr", 20862306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, 20962306a36Sopenharmony_ci {"XGMI3X16 PCS ElasticFifoOverflowErr", 21062306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, 21162306a36Sopenharmony_ci {"XGMI3X16 PCS DeskewErr", 21262306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)}, 21362306a36Sopenharmony_ci {"XGMI3X16 PCS FlowCtrlCRCErr", 21462306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)}, 21562306a36Sopenharmony_ci {"XGMI3X16 PCS DataStartupLimitErr", 21662306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)}, 21762306a36Sopenharmony_ci {"XGMI3X16 PCS FCInitTimeoutErr", 21862306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, 21962306a36Sopenharmony_ci {"XGMI3X16 PCS RecoveryTimeoutErr", 22062306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, 22162306a36Sopenharmony_ci {"XGMI3X16 PCS ReadySerialTimeoutErr", 22262306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, 22362306a36Sopenharmony_ci {"XGMI3X16 PCS ReadySerialAttemptErr", 22462306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, 22562306a36Sopenharmony_ci {"XGMI3X16 PCS RecoveryAttemptErr", 22662306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, 22762306a36Sopenharmony_ci {"XGMI3X16 PCS RecoveryRelockAttemptErr", 22862306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, 22962306a36Sopenharmony_ci {"XGMI3X16 PCS ReplayAttemptErr", 23062306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)}, 23162306a36Sopenharmony_ci {"XGMI3X16 PCS SyncHdrErr", 23262306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)}, 23362306a36Sopenharmony_ci {"XGMI3X16 PCS TxReplayTimeoutErr", 23462306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)}, 23562306a36Sopenharmony_ci {"XGMI3X16 PCS RxReplayTimeoutErr", 23662306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)}, 23762306a36Sopenharmony_ci {"XGMI3X16 PCS LinkSubTxTimeoutErr", 23862306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)}, 23962306a36Sopenharmony_ci {"XGMI3X16 PCS LinkSubRxTimeoutErr", 24062306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)}, 24162306a36Sopenharmony_ci {"XGMI3X16 PCS RxCMDPktErr", 24262306a36Sopenharmony_ci SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)}, 24362306a36Sopenharmony_ci}; 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci/** 24662306a36Sopenharmony_ci * DOC: AMDGPU XGMI Support 24762306a36Sopenharmony_ci * 24862306a36Sopenharmony_ci * XGMI is a high speed interconnect that joins multiple GPU cards 24962306a36Sopenharmony_ci * into a homogeneous memory space that is organized by a collective 25062306a36Sopenharmony_ci * hive ID and individual node IDs, both of which are 64-bit numbers. 25162306a36Sopenharmony_ci * 25262306a36Sopenharmony_ci * The file xgmi_device_id contains the unique per GPU device ID and 25362306a36Sopenharmony_ci * is stored in the /sys/class/drm/card${cardno}/device/ directory. 25462306a36Sopenharmony_ci * 25562306a36Sopenharmony_ci * Inside the device directory a sub-directory 'xgmi_hive_info' is 25662306a36Sopenharmony_ci * created which contains the hive ID and the list of nodes. 25762306a36Sopenharmony_ci * 25862306a36Sopenharmony_ci * The hive ID is stored in: 25962306a36Sopenharmony_ci * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id 26062306a36Sopenharmony_ci * 26162306a36Sopenharmony_ci * The node information is stored in numbered directories: 26262306a36Sopenharmony_ci * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id 26362306a36Sopenharmony_ci * 26462306a36Sopenharmony_ci * Each device has their own xgmi_hive_info direction with a mirror 26562306a36Sopenharmony_ci * set of node sub-directories. 26662306a36Sopenharmony_ci * 26762306a36Sopenharmony_ci * The XGMI memory space is built by contiguously adding the power of 26862306a36Sopenharmony_ci * two padded VRAM space from each node to each other. 26962306a36Sopenharmony_ci * 27062306a36Sopenharmony_ci */ 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_cistatic struct attribute amdgpu_xgmi_hive_id = { 27362306a36Sopenharmony_ci .name = "xgmi_hive_id", 27462306a36Sopenharmony_ci .mode = S_IRUGO 27562306a36Sopenharmony_ci}; 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_cistatic struct attribute *amdgpu_xgmi_hive_attrs[] = { 27862306a36Sopenharmony_ci &amdgpu_xgmi_hive_id, 27962306a36Sopenharmony_ci NULL 28062306a36Sopenharmony_ci}; 28162306a36Sopenharmony_ciATTRIBUTE_GROUPS(amdgpu_xgmi_hive); 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, 28462306a36Sopenharmony_ci struct attribute *attr, char *buf) 28562306a36Sopenharmony_ci{ 28662306a36Sopenharmony_ci struct amdgpu_hive_info *hive = container_of( 28762306a36Sopenharmony_ci kobj, struct amdgpu_hive_info, kobj); 28862306a36Sopenharmony_ci 28962306a36Sopenharmony_ci if (attr == &amdgpu_xgmi_hive_id) 29062306a36Sopenharmony_ci return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci return 0; 29362306a36Sopenharmony_ci} 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_cistatic void amdgpu_xgmi_hive_release(struct kobject *kobj) 29662306a36Sopenharmony_ci{ 29762306a36Sopenharmony_ci struct amdgpu_hive_info *hive = container_of( 29862306a36Sopenharmony_ci kobj, struct amdgpu_hive_info, kobj); 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci amdgpu_reset_put_reset_domain(hive->reset_domain); 30162306a36Sopenharmony_ci hive->reset_domain = NULL; 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci mutex_destroy(&hive->hive_lock); 30462306a36Sopenharmony_ci kfree(hive); 30562306a36Sopenharmony_ci} 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_cistatic const struct sysfs_ops amdgpu_xgmi_hive_ops = { 30862306a36Sopenharmony_ci .show = amdgpu_xgmi_show_attrs, 30962306a36Sopenharmony_ci}; 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_cistatic const struct kobj_type amdgpu_xgmi_hive_type = { 31262306a36Sopenharmony_ci .release = amdgpu_xgmi_hive_release, 31362306a36Sopenharmony_ci .sysfs_ops = &amdgpu_xgmi_hive_ops, 31462306a36Sopenharmony_ci .default_groups = amdgpu_xgmi_hive_groups, 31562306a36Sopenharmony_ci}; 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_device_id(struct device *dev, 31862306a36Sopenharmony_ci struct device_attribute *attr, 31962306a36Sopenharmony_ci char *buf) 32062306a36Sopenharmony_ci{ 32162306a36Sopenharmony_ci struct drm_device *ddev = dev_get_drvdata(dev); 32262306a36Sopenharmony_ci struct amdgpu_device *adev = drm_to_adev(ddev); 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id); 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci} 32762306a36Sopenharmony_ci 32862306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_num_hops(struct device *dev, 32962306a36Sopenharmony_ci struct device_attribute *attr, 33062306a36Sopenharmony_ci char *buf) 33162306a36Sopenharmony_ci{ 33262306a36Sopenharmony_ci struct drm_device *ddev = dev_get_drvdata(dev); 33362306a36Sopenharmony_ci struct amdgpu_device *adev = drm_to_adev(ddev); 33462306a36Sopenharmony_ci struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 33562306a36Sopenharmony_ci int i; 33662306a36Sopenharmony_ci 33762306a36Sopenharmony_ci for (i = 0; i < top->num_nodes; i++) 33862306a36Sopenharmony_ci sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_hops); 33962306a36Sopenharmony_ci 34062306a36Sopenharmony_ci return sysfs_emit(buf, "%s\n", buf); 34162306a36Sopenharmony_ci} 34262306a36Sopenharmony_ci 34362306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_num_links(struct device *dev, 34462306a36Sopenharmony_ci struct device_attribute *attr, 34562306a36Sopenharmony_ci char *buf) 34662306a36Sopenharmony_ci{ 34762306a36Sopenharmony_ci struct drm_device *ddev = dev_get_drvdata(dev); 34862306a36Sopenharmony_ci struct amdgpu_device *adev = drm_to_adev(ddev); 34962306a36Sopenharmony_ci struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 35062306a36Sopenharmony_ci int i; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci for (i = 0; i < top->num_nodes; i++) 35362306a36Sopenharmony_ci sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_links); 35462306a36Sopenharmony_ci 35562306a36Sopenharmony_ci return sysfs_emit(buf, "%s\n", buf); 35662306a36Sopenharmony_ci} 35762306a36Sopenharmony_ci 35862306a36Sopenharmony_ci#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) 35962306a36Sopenharmony_cistatic ssize_t amdgpu_xgmi_show_error(struct device *dev, 36062306a36Sopenharmony_ci struct device_attribute *attr, 36162306a36Sopenharmony_ci char *buf) 36262306a36Sopenharmony_ci{ 36362306a36Sopenharmony_ci struct drm_device *ddev = dev_get_drvdata(dev); 36462306a36Sopenharmony_ci struct amdgpu_device *adev = drm_to_adev(ddev); 36562306a36Sopenharmony_ci uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; 36662306a36Sopenharmony_ci uint64_t fica_out; 36762306a36Sopenharmony_ci unsigned int error_count = 0; 36862306a36Sopenharmony_ci 36962306a36Sopenharmony_ci ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); 37062306a36Sopenharmony_ci ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ci if ((!adev->df.funcs) || 37362306a36Sopenharmony_ci (!adev->df.funcs->get_fica) || 37462306a36Sopenharmony_ci (!adev->df.funcs->set_fica)) 37562306a36Sopenharmony_ci return -EINVAL; 37662306a36Sopenharmony_ci 37762306a36Sopenharmony_ci fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); 37862306a36Sopenharmony_ci if (fica_out != 0x1f) 37962306a36Sopenharmony_ci pr_err("xGMI error counters not enabled!\n"); 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_ci fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci if ((fica_out & 0xffff) == 2) 38462306a36Sopenharmony_ci error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_ci adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); 38762306a36Sopenharmony_ci 38862306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", error_count); 38962306a36Sopenharmony_ci} 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); 39362306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); 39462306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL); 39562306a36Sopenharmony_cistatic DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL); 39662306a36Sopenharmony_ci 39762306a36Sopenharmony_cistatic int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, 39862306a36Sopenharmony_ci struct amdgpu_hive_info *hive) 39962306a36Sopenharmony_ci{ 40062306a36Sopenharmony_ci int ret = 0; 40162306a36Sopenharmony_ci char node[10] = { 0 }; 40262306a36Sopenharmony_ci 40362306a36Sopenharmony_ci /* Create xgmi device id file */ 40462306a36Sopenharmony_ci ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); 40562306a36Sopenharmony_ci if (ret) { 40662306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); 40762306a36Sopenharmony_ci return ret; 40862306a36Sopenharmony_ci } 40962306a36Sopenharmony_ci 41062306a36Sopenharmony_ci /* Create xgmi error file */ 41162306a36Sopenharmony_ci ret = device_create_file(adev->dev, &dev_attr_xgmi_error); 41262306a36Sopenharmony_ci if (ret) 41362306a36Sopenharmony_ci pr_err("failed to create xgmi_error\n"); 41462306a36Sopenharmony_ci 41562306a36Sopenharmony_ci /* Create xgmi num hops file */ 41662306a36Sopenharmony_ci ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops); 41762306a36Sopenharmony_ci if (ret) 41862306a36Sopenharmony_ci pr_err("failed to create xgmi_num_hops\n"); 41962306a36Sopenharmony_ci 42062306a36Sopenharmony_ci /* Create xgmi num links file */ 42162306a36Sopenharmony_ci ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links); 42262306a36Sopenharmony_ci if (ret) 42362306a36Sopenharmony_ci pr_err("failed to create xgmi_num_links\n"); 42462306a36Sopenharmony_ci 42562306a36Sopenharmony_ci /* Create sysfs link to hive info folder on the first device */ 42662306a36Sopenharmony_ci if (hive->kobj.parent != (&adev->dev->kobj)) { 42762306a36Sopenharmony_ci ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj, 42862306a36Sopenharmony_ci "xgmi_hive_info"); 42962306a36Sopenharmony_ci if (ret) { 43062306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: Failed to create link to hive info"); 43162306a36Sopenharmony_ci goto remove_file; 43262306a36Sopenharmony_ci } 43362306a36Sopenharmony_ci } 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci sprintf(node, "node%d", atomic_read(&hive->number_devices)); 43662306a36Sopenharmony_ci /* Create sysfs link form the hive folder to yourself */ 43762306a36Sopenharmony_ci ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node); 43862306a36Sopenharmony_ci if (ret) { 43962306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: Failed to create link from hive info"); 44062306a36Sopenharmony_ci goto remove_link; 44162306a36Sopenharmony_ci } 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_ci goto success; 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_ci 44662306a36Sopenharmony_ciremove_link: 44762306a36Sopenharmony_ci sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); 44862306a36Sopenharmony_ci 44962306a36Sopenharmony_ciremove_file: 45062306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_device_id); 45162306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_error); 45262306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_num_hops); 45362306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_num_links); 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_cisuccess: 45662306a36Sopenharmony_ci return ret; 45762306a36Sopenharmony_ci} 45862306a36Sopenharmony_ci 45962306a36Sopenharmony_cistatic void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, 46062306a36Sopenharmony_ci struct amdgpu_hive_info *hive) 46162306a36Sopenharmony_ci{ 46262306a36Sopenharmony_ci char node[10]; 46362306a36Sopenharmony_ci memset(node, 0, sizeof(node)); 46462306a36Sopenharmony_ci 46562306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_device_id); 46662306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_error); 46762306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_num_hops); 46862306a36Sopenharmony_ci device_remove_file(adev->dev, &dev_attr_xgmi_num_links); 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci if (hive->kobj.parent != (&adev->dev->kobj)) 47162306a36Sopenharmony_ci sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci sprintf(node, "node%d", atomic_read(&hive->number_devices)); 47462306a36Sopenharmony_ci sysfs_remove_link(&hive->kobj, node); 47562306a36Sopenharmony_ci 47662306a36Sopenharmony_ci} 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_cistruct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) 48162306a36Sopenharmony_ci{ 48262306a36Sopenharmony_ci struct amdgpu_hive_info *hive = NULL; 48362306a36Sopenharmony_ci int ret; 48462306a36Sopenharmony_ci 48562306a36Sopenharmony_ci if (!adev->gmc.xgmi.hive_id) 48662306a36Sopenharmony_ci return NULL; 48762306a36Sopenharmony_ci 48862306a36Sopenharmony_ci if (adev->hive) { 48962306a36Sopenharmony_ci kobject_get(&adev->hive->kobj); 49062306a36Sopenharmony_ci return adev->hive; 49162306a36Sopenharmony_ci } 49262306a36Sopenharmony_ci 49362306a36Sopenharmony_ci mutex_lock(&xgmi_mutex); 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci list_for_each_entry(hive, &xgmi_hive_list, node) { 49662306a36Sopenharmony_ci if (hive->hive_id == adev->gmc.xgmi.hive_id) 49762306a36Sopenharmony_ci goto pro_end; 49862306a36Sopenharmony_ci } 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ci hive = kzalloc(sizeof(*hive), GFP_KERNEL); 50162306a36Sopenharmony_ci if (!hive) { 50262306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: allocation failed\n"); 50362306a36Sopenharmony_ci ret = -ENOMEM; 50462306a36Sopenharmony_ci hive = NULL; 50562306a36Sopenharmony_ci goto pro_end; 50662306a36Sopenharmony_ci } 50762306a36Sopenharmony_ci 50862306a36Sopenharmony_ci /* initialize new hive if not exist */ 50962306a36Sopenharmony_ci ret = kobject_init_and_add(&hive->kobj, 51062306a36Sopenharmony_ci &amdgpu_xgmi_hive_type, 51162306a36Sopenharmony_ci &adev->dev->kobj, 51262306a36Sopenharmony_ci "%s", "xgmi_hive_info"); 51362306a36Sopenharmony_ci if (ret) { 51462306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); 51562306a36Sopenharmony_ci kobject_put(&hive->kobj); 51662306a36Sopenharmony_ci hive = NULL; 51762306a36Sopenharmony_ci goto pro_end; 51862306a36Sopenharmony_ci } 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci /** 52162306a36Sopenharmony_ci * Only init hive->reset_domain for none SRIOV configuration. For SRIOV, 52262306a36Sopenharmony_ci * Host driver decide how to reset the GPU either through FLR or chain reset. 52362306a36Sopenharmony_ci * Guest side will get individual notifications from the host for the FLR 52462306a36Sopenharmony_ci * if necessary. 52562306a36Sopenharmony_ci */ 52662306a36Sopenharmony_ci if (!amdgpu_sriov_vf(adev)) { 52762306a36Sopenharmony_ci /** 52862306a36Sopenharmony_ci * Avoid recreating reset domain when hive is reconstructed for the case 52962306a36Sopenharmony_ci * of reset the devices in the XGMI hive during probe for passthrough GPU 53062306a36Sopenharmony_ci * See https://www.spinics.net/lists/amd-gfx/msg58836.html 53162306a36Sopenharmony_ci */ 53262306a36Sopenharmony_ci if (adev->reset_domain->type != XGMI_HIVE) { 53362306a36Sopenharmony_ci hive->reset_domain = 53462306a36Sopenharmony_ci amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); 53562306a36Sopenharmony_ci if (!hive->reset_domain) { 53662306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); 53762306a36Sopenharmony_ci ret = -ENOMEM; 53862306a36Sopenharmony_ci kobject_put(&hive->kobj); 53962306a36Sopenharmony_ci hive = NULL; 54062306a36Sopenharmony_ci goto pro_end; 54162306a36Sopenharmony_ci } 54262306a36Sopenharmony_ci } else { 54362306a36Sopenharmony_ci amdgpu_reset_get_reset_domain(adev->reset_domain); 54462306a36Sopenharmony_ci hive->reset_domain = adev->reset_domain; 54562306a36Sopenharmony_ci } 54662306a36Sopenharmony_ci } 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_ci hive->hive_id = adev->gmc.xgmi.hive_id; 54962306a36Sopenharmony_ci INIT_LIST_HEAD(&hive->device_list); 55062306a36Sopenharmony_ci INIT_LIST_HEAD(&hive->node); 55162306a36Sopenharmony_ci mutex_init(&hive->hive_lock); 55262306a36Sopenharmony_ci atomic_set(&hive->number_devices, 0); 55362306a36Sopenharmony_ci task_barrier_init(&hive->tb); 55462306a36Sopenharmony_ci hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; 55562306a36Sopenharmony_ci hive->hi_req_gpu = NULL; 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_ci /* 55862306a36Sopenharmony_ci * hive pstate on boot is high in vega20 so we have to go to low 55962306a36Sopenharmony_ci * pstate on after boot. 56062306a36Sopenharmony_ci */ 56162306a36Sopenharmony_ci hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; 56262306a36Sopenharmony_ci list_add_tail(&hive->node, &xgmi_hive_list); 56362306a36Sopenharmony_ci 56462306a36Sopenharmony_cipro_end: 56562306a36Sopenharmony_ci if (hive) 56662306a36Sopenharmony_ci kobject_get(&hive->kobj); 56762306a36Sopenharmony_ci mutex_unlock(&xgmi_mutex); 56862306a36Sopenharmony_ci return hive; 56962306a36Sopenharmony_ci} 57062306a36Sopenharmony_ci 57162306a36Sopenharmony_civoid amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) 57262306a36Sopenharmony_ci{ 57362306a36Sopenharmony_ci if (hive) 57462306a36Sopenharmony_ci kobject_put(&hive->kobj); 57562306a36Sopenharmony_ci} 57662306a36Sopenharmony_ci 57762306a36Sopenharmony_ciint amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) 57862306a36Sopenharmony_ci{ 57962306a36Sopenharmony_ci int ret = 0; 58062306a36Sopenharmony_ci struct amdgpu_hive_info *hive; 58162306a36Sopenharmony_ci struct amdgpu_device *request_adev; 58262306a36Sopenharmony_ci bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; 58362306a36Sopenharmony_ci bool init_low; 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci hive = amdgpu_get_xgmi_hive(adev); 58662306a36Sopenharmony_ci if (!hive) 58762306a36Sopenharmony_ci return 0; 58862306a36Sopenharmony_ci 58962306a36Sopenharmony_ci request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; 59062306a36Sopenharmony_ci init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; 59162306a36Sopenharmony_ci amdgpu_put_xgmi_hive(hive); 59262306a36Sopenharmony_ci /* fw bug so temporarily disable pstate switching */ 59362306a36Sopenharmony_ci return 0; 59462306a36Sopenharmony_ci 59562306a36Sopenharmony_ci if (!hive || adev->asic_type != CHIP_VEGA20) 59662306a36Sopenharmony_ci return 0; 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci mutex_lock(&hive->hive_lock); 59962306a36Sopenharmony_ci 60062306a36Sopenharmony_ci if (is_hi_req) 60162306a36Sopenharmony_ci hive->hi_req_count++; 60262306a36Sopenharmony_ci else 60362306a36Sopenharmony_ci hive->hi_req_count--; 60462306a36Sopenharmony_ci 60562306a36Sopenharmony_ci /* 60662306a36Sopenharmony_ci * Vega20 only needs single peer to request pstate high for the hive to 60762306a36Sopenharmony_ci * go high but all peers must request pstate low for the hive to go low 60862306a36Sopenharmony_ci */ 60962306a36Sopenharmony_ci if (hive->pstate == pstate || 61062306a36Sopenharmony_ci (!is_hi_req && hive->hi_req_count && !init_low)) 61162306a36Sopenharmony_ci goto out; 61262306a36Sopenharmony_ci 61362306a36Sopenharmony_ci dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate); 61462306a36Sopenharmony_ci 61562306a36Sopenharmony_ci ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); 61662306a36Sopenharmony_ci if (ret) { 61762306a36Sopenharmony_ci dev_err(request_adev->dev, 61862306a36Sopenharmony_ci "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", 61962306a36Sopenharmony_ci request_adev->gmc.xgmi.node_id, 62062306a36Sopenharmony_ci request_adev->gmc.xgmi.hive_id, ret); 62162306a36Sopenharmony_ci goto out; 62262306a36Sopenharmony_ci } 62362306a36Sopenharmony_ci 62462306a36Sopenharmony_ci if (init_low) 62562306a36Sopenharmony_ci hive->pstate = hive->hi_req_count ? 62662306a36Sopenharmony_ci hive->pstate : AMDGPU_XGMI_PSTATE_MIN; 62762306a36Sopenharmony_ci else { 62862306a36Sopenharmony_ci hive->pstate = pstate; 62962306a36Sopenharmony_ci hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? 63062306a36Sopenharmony_ci adev : NULL; 63162306a36Sopenharmony_ci } 63262306a36Sopenharmony_ciout: 63362306a36Sopenharmony_ci mutex_unlock(&hive->hive_lock); 63462306a36Sopenharmony_ci return ret; 63562306a36Sopenharmony_ci} 63662306a36Sopenharmony_ci 63762306a36Sopenharmony_ciint amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) 63862306a36Sopenharmony_ci{ 63962306a36Sopenharmony_ci int ret; 64062306a36Sopenharmony_ci 64162306a36Sopenharmony_ci if (amdgpu_sriov_vf(adev)) 64262306a36Sopenharmony_ci return 0; 64362306a36Sopenharmony_ci 64462306a36Sopenharmony_ci /* Each psp need to set the latest topology */ 64562306a36Sopenharmony_ci ret = psp_xgmi_set_topology_info(&adev->psp, 64662306a36Sopenharmony_ci atomic_read(&hive->number_devices), 64762306a36Sopenharmony_ci &adev->psp.xgmi_context.top_info); 64862306a36Sopenharmony_ci if (ret) 64962306a36Sopenharmony_ci dev_err(adev->dev, 65062306a36Sopenharmony_ci "XGMI: Set topology failure on device %llx, hive %llx, ret %d", 65162306a36Sopenharmony_ci adev->gmc.xgmi.node_id, 65262306a36Sopenharmony_ci adev->gmc.xgmi.hive_id, ret); 65362306a36Sopenharmony_ci 65462306a36Sopenharmony_ci return ret; 65562306a36Sopenharmony_ci} 65662306a36Sopenharmony_ci 65762306a36Sopenharmony_ci 65862306a36Sopenharmony_ci/* 65962306a36Sopenharmony_ci * NOTE psp_xgmi_node_info.num_hops layout is as follows: 66062306a36Sopenharmony_ci * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved) 66162306a36Sopenharmony_ci * num_hops[5:3] = reserved 66262306a36Sopenharmony_ci * num_hops[2:0] = number of hops 66362306a36Sopenharmony_ci */ 66462306a36Sopenharmony_ciint amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, 66562306a36Sopenharmony_ci struct amdgpu_device *peer_adev) 66662306a36Sopenharmony_ci{ 66762306a36Sopenharmony_ci struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 66862306a36Sopenharmony_ci uint8_t num_hops_mask = 0x7; 66962306a36Sopenharmony_ci int i; 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci for (i = 0 ; i < top->num_nodes; ++i) 67262306a36Sopenharmony_ci if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) 67362306a36Sopenharmony_ci return top->nodes[i].num_hops & num_hops_mask; 67462306a36Sopenharmony_ci return -EINVAL; 67562306a36Sopenharmony_ci} 67662306a36Sopenharmony_ci 67762306a36Sopenharmony_ciint amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, 67862306a36Sopenharmony_ci struct amdgpu_device *peer_adev) 67962306a36Sopenharmony_ci{ 68062306a36Sopenharmony_ci struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 68162306a36Sopenharmony_ci int i; 68262306a36Sopenharmony_ci 68362306a36Sopenharmony_ci for (i = 0 ; i < top->num_nodes; ++i) 68462306a36Sopenharmony_ci if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) 68562306a36Sopenharmony_ci return top->nodes[i].num_links; 68662306a36Sopenharmony_ci return -EINVAL; 68762306a36Sopenharmony_ci} 68862306a36Sopenharmony_ci 68962306a36Sopenharmony_ci/* 69062306a36Sopenharmony_ci * Devices that support extended data require the entire hive to initialize with 69162306a36Sopenharmony_ci * the shared memory buffer flag set. 69262306a36Sopenharmony_ci * 69362306a36Sopenharmony_ci * Hive locks and conditions apply - see amdgpu_xgmi_add_device 69462306a36Sopenharmony_ci */ 69562306a36Sopenharmony_cistatic int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive, 69662306a36Sopenharmony_ci bool set_extended_data) 69762306a36Sopenharmony_ci{ 69862306a36Sopenharmony_ci struct amdgpu_device *tmp_adev; 69962306a36Sopenharmony_ci int ret; 70062306a36Sopenharmony_ci 70162306a36Sopenharmony_ci list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 70262306a36Sopenharmony_ci ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false); 70362306a36Sopenharmony_ci if (ret) { 70462306a36Sopenharmony_ci dev_err(tmp_adev->dev, 70562306a36Sopenharmony_ci "XGMI: Failed to initialize xgmi session for data partition %i\n", 70662306a36Sopenharmony_ci set_extended_data); 70762306a36Sopenharmony_ci return ret; 70862306a36Sopenharmony_ci } 70962306a36Sopenharmony_ci 71062306a36Sopenharmony_ci } 71162306a36Sopenharmony_ci 71262306a36Sopenharmony_ci return 0; 71362306a36Sopenharmony_ci} 71462306a36Sopenharmony_ci 71562306a36Sopenharmony_ciint amdgpu_xgmi_add_device(struct amdgpu_device *adev) 71662306a36Sopenharmony_ci{ 71762306a36Sopenharmony_ci struct psp_xgmi_topology_info *top_info; 71862306a36Sopenharmony_ci struct amdgpu_hive_info *hive; 71962306a36Sopenharmony_ci struct amdgpu_xgmi *entry; 72062306a36Sopenharmony_ci struct amdgpu_device *tmp_adev = NULL; 72162306a36Sopenharmony_ci 72262306a36Sopenharmony_ci int count = 0, ret = 0; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci if (!adev->gmc.xgmi.supported) 72562306a36Sopenharmony_ci return 0; 72662306a36Sopenharmony_ci 72762306a36Sopenharmony_ci if (!adev->gmc.xgmi.pending_reset && 72862306a36Sopenharmony_ci amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { 72962306a36Sopenharmony_ci ret = psp_xgmi_initialize(&adev->psp, false, true); 73062306a36Sopenharmony_ci if (ret) { 73162306a36Sopenharmony_ci dev_err(adev->dev, 73262306a36Sopenharmony_ci "XGMI: Failed to initialize xgmi session\n"); 73362306a36Sopenharmony_ci return ret; 73462306a36Sopenharmony_ci } 73562306a36Sopenharmony_ci 73662306a36Sopenharmony_ci ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); 73762306a36Sopenharmony_ci if (ret) { 73862306a36Sopenharmony_ci dev_err(adev->dev, 73962306a36Sopenharmony_ci "XGMI: Failed to get hive id\n"); 74062306a36Sopenharmony_ci return ret; 74162306a36Sopenharmony_ci } 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); 74462306a36Sopenharmony_ci if (ret) { 74562306a36Sopenharmony_ci dev_err(adev->dev, 74662306a36Sopenharmony_ci "XGMI: Failed to get node id\n"); 74762306a36Sopenharmony_ci return ret; 74862306a36Sopenharmony_ci } 74962306a36Sopenharmony_ci } else { 75062306a36Sopenharmony_ci adev->gmc.xgmi.hive_id = 16; 75162306a36Sopenharmony_ci adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; 75262306a36Sopenharmony_ci } 75362306a36Sopenharmony_ci 75462306a36Sopenharmony_ci hive = amdgpu_get_xgmi_hive(adev); 75562306a36Sopenharmony_ci if (!hive) { 75662306a36Sopenharmony_ci ret = -EINVAL; 75762306a36Sopenharmony_ci dev_err(adev->dev, 75862306a36Sopenharmony_ci "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n", 75962306a36Sopenharmony_ci adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); 76062306a36Sopenharmony_ci goto exit; 76162306a36Sopenharmony_ci } 76262306a36Sopenharmony_ci mutex_lock(&hive->hive_lock); 76362306a36Sopenharmony_ci 76462306a36Sopenharmony_ci top_info = &adev->psp.xgmi_context.top_info; 76562306a36Sopenharmony_ci 76662306a36Sopenharmony_ci list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); 76762306a36Sopenharmony_ci list_for_each_entry(entry, &hive->device_list, head) 76862306a36Sopenharmony_ci top_info->nodes[count++].node_id = entry->node_id; 76962306a36Sopenharmony_ci top_info->num_nodes = count; 77062306a36Sopenharmony_ci atomic_set(&hive->number_devices, count); 77162306a36Sopenharmony_ci 77262306a36Sopenharmony_ci task_barrier_add_task(&hive->tb); 77362306a36Sopenharmony_ci 77462306a36Sopenharmony_ci if (!adev->gmc.xgmi.pending_reset && 77562306a36Sopenharmony_ci amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { 77662306a36Sopenharmony_ci list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 77762306a36Sopenharmony_ci /* update node list for other device in the hive */ 77862306a36Sopenharmony_ci if (tmp_adev != adev) { 77962306a36Sopenharmony_ci top_info = &tmp_adev->psp.xgmi_context.top_info; 78062306a36Sopenharmony_ci top_info->nodes[count - 1].node_id = 78162306a36Sopenharmony_ci adev->gmc.xgmi.node_id; 78262306a36Sopenharmony_ci top_info->num_nodes = count; 78362306a36Sopenharmony_ci } 78462306a36Sopenharmony_ci ret = amdgpu_xgmi_update_topology(hive, tmp_adev); 78562306a36Sopenharmony_ci if (ret) 78662306a36Sopenharmony_ci goto exit_unlock; 78762306a36Sopenharmony_ci } 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_ci /* get latest topology info for each device from psp */ 79062306a36Sopenharmony_ci list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 79162306a36Sopenharmony_ci ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, 79262306a36Sopenharmony_ci &tmp_adev->psp.xgmi_context.top_info, false); 79362306a36Sopenharmony_ci if (ret) { 79462306a36Sopenharmony_ci dev_err(tmp_adev->dev, 79562306a36Sopenharmony_ci "XGMI: Get topology failure on device %llx, hive %llx, ret %d", 79662306a36Sopenharmony_ci tmp_adev->gmc.xgmi.node_id, 79762306a36Sopenharmony_ci tmp_adev->gmc.xgmi.hive_id, ret); 79862306a36Sopenharmony_ci /* To do : continue with some node failed or disable the whole hive */ 79962306a36Sopenharmony_ci goto exit_unlock; 80062306a36Sopenharmony_ci } 80162306a36Sopenharmony_ci } 80262306a36Sopenharmony_ci 80362306a36Sopenharmony_ci /* get topology again for hives that support extended data */ 80462306a36Sopenharmony_ci if (adev->psp.xgmi_context.supports_extended_data) { 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_ci /* initialize the hive to get extended data. */ 80762306a36Sopenharmony_ci ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true); 80862306a36Sopenharmony_ci if (ret) 80962306a36Sopenharmony_ci goto exit_unlock; 81062306a36Sopenharmony_ci 81162306a36Sopenharmony_ci /* get the extended data. */ 81262306a36Sopenharmony_ci list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 81362306a36Sopenharmony_ci ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, 81462306a36Sopenharmony_ci &tmp_adev->psp.xgmi_context.top_info, true); 81562306a36Sopenharmony_ci if (ret) { 81662306a36Sopenharmony_ci dev_err(tmp_adev->dev, 81762306a36Sopenharmony_ci "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d", 81862306a36Sopenharmony_ci tmp_adev->gmc.xgmi.node_id, 81962306a36Sopenharmony_ci tmp_adev->gmc.xgmi.hive_id, ret); 82062306a36Sopenharmony_ci goto exit_unlock; 82162306a36Sopenharmony_ci } 82262306a36Sopenharmony_ci } 82362306a36Sopenharmony_ci 82462306a36Sopenharmony_ci /* initialize the hive to get non-extended data for the next round. */ 82562306a36Sopenharmony_ci ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false); 82662306a36Sopenharmony_ci if (ret) 82762306a36Sopenharmony_ci goto exit_unlock; 82862306a36Sopenharmony_ci 82962306a36Sopenharmony_ci } 83062306a36Sopenharmony_ci } 83162306a36Sopenharmony_ci 83262306a36Sopenharmony_ci if (!ret && !adev->gmc.xgmi.pending_reset) 83362306a36Sopenharmony_ci ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); 83462306a36Sopenharmony_ci 83562306a36Sopenharmony_ciexit_unlock: 83662306a36Sopenharmony_ci mutex_unlock(&hive->hive_lock); 83762306a36Sopenharmony_ciexit: 83862306a36Sopenharmony_ci if (!ret) { 83962306a36Sopenharmony_ci adev->hive = hive; 84062306a36Sopenharmony_ci dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", 84162306a36Sopenharmony_ci adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); 84262306a36Sopenharmony_ci } else { 84362306a36Sopenharmony_ci amdgpu_put_xgmi_hive(hive); 84462306a36Sopenharmony_ci dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", 84562306a36Sopenharmony_ci adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, 84662306a36Sopenharmony_ci ret); 84762306a36Sopenharmony_ci } 84862306a36Sopenharmony_ci 84962306a36Sopenharmony_ci return ret; 85062306a36Sopenharmony_ci} 85162306a36Sopenharmony_ci 85262306a36Sopenharmony_ciint amdgpu_xgmi_remove_device(struct amdgpu_device *adev) 85362306a36Sopenharmony_ci{ 85462306a36Sopenharmony_ci struct amdgpu_hive_info *hive = adev->hive; 85562306a36Sopenharmony_ci 85662306a36Sopenharmony_ci if (!adev->gmc.xgmi.supported) 85762306a36Sopenharmony_ci return -EINVAL; 85862306a36Sopenharmony_ci 85962306a36Sopenharmony_ci if (!hive) 86062306a36Sopenharmony_ci return -EINVAL; 86162306a36Sopenharmony_ci 86262306a36Sopenharmony_ci mutex_lock(&hive->hive_lock); 86362306a36Sopenharmony_ci task_barrier_rem_task(&hive->tb); 86462306a36Sopenharmony_ci amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); 86562306a36Sopenharmony_ci if (hive->hi_req_gpu == adev) 86662306a36Sopenharmony_ci hive->hi_req_gpu = NULL; 86762306a36Sopenharmony_ci list_del(&adev->gmc.xgmi.head); 86862306a36Sopenharmony_ci mutex_unlock(&hive->hive_lock); 86962306a36Sopenharmony_ci 87062306a36Sopenharmony_ci amdgpu_put_xgmi_hive(hive); 87162306a36Sopenharmony_ci adev->hive = NULL; 87262306a36Sopenharmony_ci 87362306a36Sopenharmony_ci if (atomic_dec_return(&hive->number_devices) == 0) { 87462306a36Sopenharmony_ci /* Remove the hive from global hive list */ 87562306a36Sopenharmony_ci mutex_lock(&xgmi_mutex); 87662306a36Sopenharmony_ci list_del(&hive->node); 87762306a36Sopenharmony_ci mutex_unlock(&xgmi_mutex); 87862306a36Sopenharmony_ci 87962306a36Sopenharmony_ci amdgpu_put_xgmi_hive(hive); 88062306a36Sopenharmony_ci } 88162306a36Sopenharmony_ci 88262306a36Sopenharmony_ci return 0; 88362306a36Sopenharmony_ci} 88462306a36Sopenharmony_ci 88562306a36Sopenharmony_cistatic int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) 88662306a36Sopenharmony_ci{ 88762306a36Sopenharmony_ci if (!adev->gmc.xgmi.supported || 88862306a36Sopenharmony_ci adev->gmc.xgmi.num_physical_nodes == 0) 88962306a36Sopenharmony_ci return 0; 89062306a36Sopenharmony_ci 89162306a36Sopenharmony_ci adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); 89262306a36Sopenharmony_ci 89362306a36Sopenharmony_ci return amdgpu_ras_block_late_init(adev, ras_block); 89462306a36Sopenharmony_ci} 89562306a36Sopenharmony_ci 89662306a36Sopenharmony_ciuint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, 89762306a36Sopenharmony_ci uint64_t addr) 89862306a36Sopenharmony_ci{ 89962306a36Sopenharmony_ci struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi; 90062306a36Sopenharmony_ci return (addr + xgmi->physical_node_id * xgmi->node_segment_size); 90162306a36Sopenharmony_ci} 90262306a36Sopenharmony_ci 90362306a36Sopenharmony_cistatic void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) 90462306a36Sopenharmony_ci{ 90562306a36Sopenharmony_ci WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); 90662306a36Sopenharmony_ci WREG32_PCIE(pcs_status_reg, 0); 90762306a36Sopenharmony_ci} 90862306a36Sopenharmony_ci 90962306a36Sopenharmony_cistatic void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) 91062306a36Sopenharmony_ci{ 91162306a36Sopenharmony_ci uint32_t i; 91262306a36Sopenharmony_ci 91362306a36Sopenharmony_ci switch (adev->asic_type) { 91462306a36Sopenharmony_ci case CHIP_ARCTURUS: 91562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) 91662306a36Sopenharmony_ci pcs_clear_status(adev, 91762306a36Sopenharmony_ci xgmi_pcs_err_status_reg_arct[i]); 91862306a36Sopenharmony_ci break; 91962306a36Sopenharmony_ci case CHIP_VEGA20: 92062306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) 92162306a36Sopenharmony_ci pcs_clear_status(adev, 92262306a36Sopenharmony_ci xgmi_pcs_err_status_reg_vg20[i]); 92362306a36Sopenharmony_ci break; 92462306a36Sopenharmony_ci case CHIP_ALDEBARAN: 92562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) 92662306a36Sopenharmony_ci pcs_clear_status(adev, 92762306a36Sopenharmony_ci xgmi3x16_pcs_err_status_reg_aldebaran[i]); 92862306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) 92962306a36Sopenharmony_ci pcs_clear_status(adev, 93062306a36Sopenharmony_ci walf_pcs_err_status_reg_aldebaran[i]); 93162306a36Sopenharmony_ci break; 93262306a36Sopenharmony_ci default: 93362306a36Sopenharmony_ci break; 93462306a36Sopenharmony_ci } 93562306a36Sopenharmony_ci} 93662306a36Sopenharmony_ci 93762306a36Sopenharmony_cistatic int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, 93862306a36Sopenharmony_ci uint32_t value, 93962306a36Sopenharmony_ci uint32_t mask_value, 94062306a36Sopenharmony_ci uint32_t *ue_count, 94162306a36Sopenharmony_ci uint32_t *ce_count, 94262306a36Sopenharmony_ci bool is_xgmi_pcs, 94362306a36Sopenharmony_ci bool check_mask) 94462306a36Sopenharmony_ci{ 94562306a36Sopenharmony_ci int i; 94662306a36Sopenharmony_ci int ue_cnt = 0; 94762306a36Sopenharmony_ci const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL; 94862306a36Sopenharmony_ci uint32_t field_array_size = 0; 94962306a36Sopenharmony_ci 95062306a36Sopenharmony_ci if (is_xgmi_pcs) { 95162306a36Sopenharmony_ci if (adev->ip_versions[XGMI_HWIP][0] == IP_VERSION(6, 1, 0)) { 95262306a36Sopenharmony_ci pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0]; 95362306a36Sopenharmony_ci field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields); 95462306a36Sopenharmony_ci } else { 95562306a36Sopenharmony_ci pcs_ras_fields = &xgmi_pcs_ras_fields[0]; 95662306a36Sopenharmony_ci field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields); 95762306a36Sopenharmony_ci } 95862306a36Sopenharmony_ci } else { 95962306a36Sopenharmony_ci pcs_ras_fields = &wafl_pcs_ras_fields[0]; 96062306a36Sopenharmony_ci field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields); 96162306a36Sopenharmony_ci } 96262306a36Sopenharmony_ci 96362306a36Sopenharmony_ci if (check_mask) 96462306a36Sopenharmony_ci value = value & ~mask_value; 96562306a36Sopenharmony_ci 96662306a36Sopenharmony_ci /* query xgmi/walf pcs error status, 96762306a36Sopenharmony_ci * only ue is supported */ 96862306a36Sopenharmony_ci for (i = 0; value && i < field_array_size; i++) { 96962306a36Sopenharmony_ci ue_cnt = (value & 97062306a36Sopenharmony_ci pcs_ras_fields[i].pcs_err_mask) >> 97162306a36Sopenharmony_ci pcs_ras_fields[i].pcs_err_shift; 97262306a36Sopenharmony_ci if (ue_cnt) { 97362306a36Sopenharmony_ci dev_info(adev->dev, "%s detected\n", 97462306a36Sopenharmony_ci pcs_ras_fields[i].err_name); 97562306a36Sopenharmony_ci *ue_count += ue_cnt; 97662306a36Sopenharmony_ci } 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci /* reset bit value if the bit is checked */ 97962306a36Sopenharmony_ci value &= ~(pcs_ras_fields[i].pcs_err_mask); 98062306a36Sopenharmony_ci } 98162306a36Sopenharmony_ci 98262306a36Sopenharmony_ci return 0; 98362306a36Sopenharmony_ci} 98462306a36Sopenharmony_ci 98562306a36Sopenharmony_cistatic void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, 98662306a36Sopenharmony_ci void *ras_error_status) 98762306a36Sopenharmony_ci{ 98862306a36Sopenharmony_ci struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 98962306a36Sopenharmony_ci int i; 99062306a36Sopenharmony_ci uint32_t data, mask_data = 0; 99162306a36Sopenharmony_ci uint32_t ue_cnt = 0, ce_cnt = 0; 99262306a36Sopenharmony_ci 99362306a36Sopenharmony_ci if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) 99462306a36Sopenharmony_ci return ; 99562306a36Sopenharmony_ci 99662306a36Sopenharmony_ci err_data->ue_count = 0; 99762306a36Sopenharmony_ci err_data->ce_count = 0; 99862306a36Sopenharmony_ci 99962306a36Sopenharmony_ci switch (adev->asic_type) { 100062306a36Sopenharmony_ci case CHIP_ARCTURUS: 100162306a36Sopenharmony_ci /* check xgmi pcs error */ 100262306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { 100362306a36Sopenharmony_ci data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); 100462306a36Sopenharmony_ci if (data) 100562306a36Sopenharmony_ci amdgpu_xgmi_query_pcs_error_status(adev, data, 100662306a36Sopenharmony_ci mask_data, &ue_cnt, &ce_cnt, true, false); 100762306a36Sopenharmony_ci } 100862306a36Sopenharmony_ci /* check wafl pcs error */ 100962306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { 101062306a36Sopenharmony_ci data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); 101162306a36Sopenharmony_ci if (data) 101262306a36Sopenharmony_ci amdgpu_xgmi_query_pcs_error_status(adev, data, 101362306a36Sopenharmony_ci mask_data, &ue_cnt, &ce_cnt, false, false); 101462306a36Sopenharmony_ci } 101562306a36Sopenharmony_ci break; 101662306a36Sopenharmony_ci case CHIP_VEGA20: 101762306a36Sopenharmony_ci /* check xgmi pcs error */ 101862306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { 101962306a36Sopenharmony_ci data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); 102062306a36Sopenharmony_ci if (data) 102162306a36Sopenharmony_ci amdgpu_xgmi_query_pcs_error_status(adev, data, 102262306a36Sopenharmony_ci mask_data, &ue_cnt, &ce_cnt, true, false); 102362306a36Sopenharmony_ci } 102462306a36Sopenharmony_ci /* check wafl pcs error */ 102562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { 102662306a36Sopenharmony_ci data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); 102762306a36Sopenharmony_ci if (data) 102862306a36Sopenharmony_ci amdgpu_xgmi_query_pcs_error_status(adev, data, 102962306a36Sopenharmony_ci mask_data, &ue_cnt, &ce_cnt, false, false); 103062306a36Sopenharmony_ci } 103162306a36Sopenharmony_ci break; 103262306a36Sopenharmony_ci case CHIP_ALDEBARAN: 103362306a36Sopenharmony_ci /* check xgmi3x16 pcs error */ 103462306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { 103562306a36Sopenharmony_ci data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); 103662306a36Sopenharmony_ci mask_data = 103762306a36Sopenharmony_ci RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]); 103862306a36Sopenharmony_ci if (data) 103962306a36Sopenharmony_ci amdgpu_xgmi_query_pcs_error_status(adev, data, 104062306a36Sopenharmony_ci mask_data, &ue_cnt, &ce_cnt, true, true); 104162306a36Sopenharmony_ci } 104262306a36Sopenharmony_ci /* check wafl pcs error */ 104362306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { 104462306a36Sopenharmony_ci data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); 104562306a36Sopenharmony_ci mask_data = 104662306a36Sopenharmony_ci RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]); 104762306a36Sopenharmony_ci if (data) 104862306a36Sopenharmony_ci amdgpu_xgmi_query_pcs_error_status(adev, data, 104962306a36Sopenharmony_ci mask_data, &ue_cnt, &ce_cnt, false, true); 105062306a36Sopenharmony_ci } 105162306a36Sopenharmony_ci break; 105262306a36Sopenharmony_ci default: 105362306a36Sopenharmony_ci dev_warn(adev->dev, "XGMI RAS error query not supported"); 105462306a36Sopenharmony_ci break; 105562306a36Sopenharmony_ci } 105662306a36Sopenharmony_ci 105762306a36Sopenharmony_ci adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); 105862306a36Sopenharmony_ci 105962306a36Sopenharmony_ci err_data->ue_count += ue_cnt; 106062306a36Sopenharmony_ci err_data->ce_count += ce_cnt; 106162306a36Sopenharmony_ci} 106262306a36Sopenharmony_ci 106362306a36Sopenharmony_ci/* Trigger XGMI/WAFL error */ 106462306a36Sopenharmony_cistatic int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, 106562306a36Sopenharmony_ci void *inject_if, uint32_t instance_mask) 106662306a36Sopenharmony_ci{ 106762306a36Sopenharmony_ci int ret = 0; 106862306a36Sopenharmony_ci struct ta_ras_trigger_error_input *block_info = 106962306a36Sopenharmony_ci (struct ta_ras_trigger_error_input *)inject_if; 107062306a36Sopenharmony_ci 107162306a36Sopenharmony_ci if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 107262306a36Sopenharmony_ci dev_warn(adev->dev, "Failed to disallow df cstate"); 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) 107562306a36Sopenharmony_ci dev_warn(adev->dev, "Failed to disallow XGMI power down"); 107662306a36Sopenharmony_ci 107762306a36Sopenharmony_ci ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); 107862306a36Sopenharmony_ci 107962306a36Sopenharmony_ci if (amdgpu_ras_intr_triggered()) 108062306a36Sopenharmony_ci return ret; 108162306a36Sopenharmony_ci 108262306a36Sopenharmony_ci if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) 108362306a36Sopenharmony_ci dev_warn(adev->dev, "Failed to allow XGMI power down"); 108462306a36Sopenharmony_ci 108562306a36Sopenharmony_ci if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) 108662306a36Sopenharmony_ci dev_warn(adev->dev, "Failed to allow df cstate"); 108762306a36Sopenharmony_ci 108862306a36Sopenharmony_ci return ret; 108962306a36Sopenharmony_ci} 109062306a36Sopenharmony_ci 109162306a36Sopenharmony_cistruct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { 109262306a36Sopenharmony_ci .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, 109362306a36Sopenharmony_ci .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, 109462306a36Sopenharmony_ci .ras_error_inject = amdgpu_ras_error_inject_xgmi, 109562306a36Sopenharmony_ci}; 109662306a36Sopenharmony_ci 109762306a36Sopenharmony_cistruct amdgpu_xgmi_ras xgmi_ras = { 109862306a36Sopenharmony_ci .ras_block = { 109962306a36Sopenharmony_ci .hw_ops = &xgmi_ras_hw_ops, 110062306a36Sopenharmony_ci .ras_late_init = amdgpu_xgmi_ras_late_init, 110162306a36Sopenharmony_ci }, 110262306a36Sopenharmony_ci}; 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ciint amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) 110562306a36Sopenharmony_ci{ 110662306a36Sopenharmony_ci int err; 110762306a36Sopenharmony_ci struct amdgpu_xgmi_ras *ras; 110862306a36Sopenharmony_ci 110962306a36Sopenharmony_ci if (!adev->gmc.xgmi.ras) 111062306a36Sopenharmony_ci return 0; 111162306a36Sopenharmony_ci 111262306a36Sopenharmony_ci ras = adev->gmc.xgmi.ras; 111362306a36Sopenharmony_ci err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); 111462306a36Sopenharmony_ci if (err) { 111562306a36Sopenharmony_ci dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n"); 111662306a36Sopenharmony_ci return err; 111762306a36Sopenharmony_ci } 111862306a36Sopenharmony_ci 111962306a36Sopenharmony_ci strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl"); 112062306a36Sopenharmony_ci ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL; 112162306a36Sopenharmony_ci ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 112262306a36Sopenharmony_ci adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm; 112362306a36Sopenharmony_ci 112462306a36Sopenharmony_ci return 0; 112562306a36Sopenharmony_ci} 1126