162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 362306a36Sopenharmony_ci * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * This software is available to you under a choice of one of two 662306a36Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 762306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 862306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the 962306a36Sopenharmony_ci * OpenIB.org BSD license below: 1062306a36Sopenharmony_ci * 1162306a36Sopenharmony_ci * Redistribution and use in source and binary forms, with or 1262306a36Sopenharmony_ci * without modification, are permitted provided that the following 1362306a36Sopenharmony_ci * conditions are met: 1462306a36Sopenharmony_ci * 1562306a36Sopenharmony_ci * - Redistributions of source code must retain the above 1662306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 1762306a36Sopenharmony_ci * disclaimer. 1862306a36Sopenharmony_ci * 1962306a36Sopenharmony_ci * - Redistributions in binary form must reproduce the above 2062306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 2162306a36Sopenharmony_ci * disclaimer in the documentation and/or other materials 2262306a36Sopenharmony_ci * provided with the distribution. 2362306a36Sopenharmony_ci * 2462306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2562306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2662306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2762306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2862306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2962306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 3062306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3162306a36Sopenharmony_ci * SOFTWARE. 3262306a36Sopenharmony_ci */ 3362306a36Sopenharmony_ci 3462306a36Sopenharmony_ci#include <linux/workqueue.h> 3562306a36Sopenharmony_ci#include <linux/module.h> 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci#include "mlx4.h" 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_cienum { 4062306a36Sopenharmony_ci MLX4_CATAS_POLL_INTERVAL = 5 * HZ, 4162306a36Sopenharmony_ci}; 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ciint mlx4_internal_err_reset = 1; 4662306a36Sopenharmony_cimodule_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); 4762306a36Sopenharmony_ciMODULE_PARM_DESC(internal_err_reset, 4862306a36Sopenharmony_ci "Reset device on internal errors if non-zero (default 1)"); 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_cistatic int read_vendor_id(struct mlx4_dev *dev) 5162306a36Sopenharmony_ci{ 5262306a36Sopenharmony_ci u16 vendor_id = 0; 5362306a36Sopenharmony_ci int ret; 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_ci ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); 5662306a36Sopenharmony_ci if (ret) { 5762306a36Sopenharmony_ci mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); 5862306a36Sopenharmony_ci return ret; 5962306a36Sopenharmony_ci } 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ci if (vendor_id == 0xffff) { 6262306a36Sopenharmony_ci mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); 6362306a36Sopenharmony_ci return -EINVAL; 6462306a36Sopenharmony_ci } 6562306a36Sopenharmony_ci 6662306a36Sopenharmony_ci return 0; 6762306a36Sopenharmony_ci} 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_cistatic int mlx4_reset_master(struct mlx4_dev *dev) 7062306a36Sopenharmony_ci{ 7162306a36Sopenharmony_ci int err = 0; 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_ci if (mlx4_is_master(dev)) 7462306a36Sopenharmony_ci mlx4_report_internal_err_comm_event(dev); 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_ci if (!pci_channel_offline(dev->persist->pdev)) { 7762306a36Sopenharmony_ci err = read_vendor_id(dev); 7862306a36Sopenharmony_ci /* If PCI can't be accessed to read vendor ID we assume that its 7962306a36Sopenharmony_ci * link was disabled and chip was already reset. 8062306a36Sopenharmony_ci */ 8162306a36Sopenharmony_ci if (err) 8262306a36Sopenharmony_ci return 0; 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci err = mlx4_reset(dev); 8562306a36Sopenharmony_ci if (err) 8662306a36Sopenharmony_ci mlx4_err(dev, "Fail to reset HCA\n"); 8762306a36Sopenharmony_ci } 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci return err; 9062306a36Sopenharmony_ci} 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_cistatic int mlx4_reset_slave(struct mlx4_dev *dev) 9362306a36Sopenharmony_ci{ 9462306a36Sopenharmony_ci#define COM_CHAN_RST_REQ_OFFSET 0x10 9562306a36Sopenharmony_ci#define COM_CHAN_RST_ACK_OFFSET 0x08 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ci u32 comm_flags; 9862306a36Sopenharmony_ci u32 rst_req; 9962306a36Sopenharmony_ci u32 rst_ack; 10062306a36Sopenharmony_ci unsigned long end; 10162306a36Sopenharmony_ci struct mlx4_priv *priv = mlx4_priv(dev); 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_ci if (pci_channel_offline(dev->persist->pdev)) 10462306a36Sopenharmony_ci return 0; 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + 10762306a36Sopenharmony_ci MLX4_COMM_CHAN_FLAGS)); 10862306a36Sopenharmony_ci if (comm_flags == 0xffffffff) { 10962306a36Sopenharmony_ci mlx4_err(dev, "VF reset is not needed\n"); 11062306a36Sopenharmony_ci return 0; 11162306a36Sopenharmony_ci } 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_ci if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { 11462306a36Sopenharmony_ci mlx4_err(dev, "VF reset is not supported\n"); 11562306a36Sopenharmony_ci return -EOPNOTSUPP; 11662306a36Sopenharmony_ci } 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> 11962306a36Sopenharmony_ci COM_CHAN_RST_REQ_OFFSET; 12062306a36Sopenharmony_ci rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> 12162306a36Sopenharmony_ci COM_CHAN_RST_ACK_OFFSET; 12262306a36Sopenharmony_ci if (rst_req != rst_ack) { 12362306a36Sopenharmony_ci mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); 12462306a36Sopenharmony_ci return -EIO; 12562306a36Sopenharmony_ci } 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci rst_req ^= 1; 12862306a36Sopenharmony_ci mlx4_warn(dev, "VF is sending reset request to Firmware\n"); 12962306a36Sopenharmony_ci comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; 13062306a36Sopenharmony_ci __raw_writel((__force u32)cpu_to_be32(comm_flags), 13162306a36Sopenharmony_ci (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; 13462306a36Sopenharmony_ci while (time_before(jiffies, end)) { 13562306a36Sopenharmony_ci comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + 13662306a36Sopenharmony_ci MLX4_COMM_CHAN_FLAGS)); 13762306a36Sopenharmony_ci rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> 13862306a36Sopenharmony_ci COM_CHAN_RST_ACK_OFFSET; 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_ci /* Reading rst_req again since the communication channel can 14162306a36Sopenharmony_ci * be reset at any time by the PF and all its bits will be 14262306a36Sopenharmony_ci * set to zero. 14362306a36Sopenharmony_ci */ 14462306a36Sopenharmony_ci rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> 14562306a36Sopenharmony_ci COM_CHAN_RST_REQ_OFFSET; 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci if (rst_ack == rst_req) { 14862306a36Sopenharmony_ci mlx4_warn(dev, "VF Reset succeed\n"); 14962306a36Sopenharmony_ci return 0; 15062306a36Sopenharmony_ci } 15162306a36Sopenharmony_ci cond_resched(); 15262306a36Sopenharmony_ci } 15362306a36Sopenharmony_ci mlx4_err(dev, "Fail to send reset over the communication channel\n"); 15462306a36Sopenharmony_ci return -ETIMEDOUT; 15562306a36Sopenharmony_ci} 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_ciint mlx4_comm_internal_err(u32 slave_read) 15862306a36Sopenharmony_ci{ 15962306a36Sopenharmony_ci return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == 16062306a36Sopenharmony_ci (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0; 16162306a36Sopenharmony_ci} 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_civoid mlx4_enter_error_state(struct mlx4_dev_persistent *persist) 16462306a36Sopenharmony_ci{ 16562306a36Sopenharmony_ci int err; 16662306a36Sopenharmony_ci struct mlx4_dev *dev; 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci if (!mlx4_internal_err_reset) 16962306a36Sopenharmony_ci return; 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci mutex_lock(&persist->device_state_mutex); 17262306a36Sopenharmony_ci if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) 17362306a36Sopenharmony_ci goto out; 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_ci dev = persist->dev; 17662306a36Sopenharmony_ci mlx4_err(dev, "device is going to be reset\n"); 17762306a36Sopenharmony_ci if (mlx4_is_slave(dev)) { 17862306a36Sopenharmony_ci err = mlx4_reset_slave(dev); 17962306a36Sopenharmony_ci } else { 18062306a36Sopenharmony_ci mlx4_crdump_collect(dev); 18162306a36Sopenharmony_ci err = mlx4_reset_master(dev); 18262306a36Sopenharmony_ci } 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci if (!err) { 18562306a36Sopenharmony_ci mlx4_err(dev, "device was reset successfully\n"); 18662306a36Sopenharmony_ci } else { 18762306a36Sopenharmony_ci /* EEH could have disabled the PCI channel during reset. That's 18862306a36Sopenharmony_ci * recoverable and the PCI error flow will handle it. 18962306a36Sopenharmony_ci */ 19062306a36Sopenharmony_ci if (!pci_channel_offline(dev->persist->pdev)) 19162306a36Sopenharmony_ci BUG_ON(1); 19262306a36Sopenharmony_ci } 19362306a36Sopenharmony_ci dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; 19462306a36Sopenharmony_ci mutex_unlock(&persist->device_state_mutex); 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci /* At that step HW was already reset, now notify clients */ 19762306a36Sopenharmony_ci mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, NULL); 19862306a36Sopenharmony_ci mlx4_cmd_wake_completions(dev); 19962306a36Sopenharmony_ci return; 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ciout: 20262306a36Sopenharmony_ci mutex_unlock(&persist->device_state_mutex); 20362306a36Sopenharmony_ci} 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_cistatic void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) 20662306a36Sopenharmony_ci{ 20762306a36Sopenharmony_ci struct mlx4_dev *dev = persist->dev; 20862306a36Sopenharmony_ci struct devlink *devlink; 20962306a36Sopenharmony_ci int err = 0; 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci mlx4_enter_error_state(persist); 21262306a36Sopenharmony_ci devlink = priv_to_devlink(mlx4_priv(dev)); 21362306a36Sopenharmony_ci devl_lock(devlink); 21462306a36Sopenharmony_ci mutex_lock(&persist->interface_state_mutex); 21562306a36Sopenharmony_ci if (persist->interface_state & MLX4_INTERFACE_STATE_UP && 21662306a36Sopenharmony_ci !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { 21762306a36Sopenharmony_ci err = mlx4_restart_one(persist->pdev); 21862306a36Sopenharmony_ci mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", 21962306a36Sopenharmony_ci err); 22062306a36Sopenharmony_ci } 22162306a36Sopenharmony_ci mutex_unlock(&persist->interface_state_mutex); 22262306a36Sopenharmony_ci devl_unlock(devlink); 22362306a36Sopenharmony_ci} 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_cistatic void dump_err_buf(struct mlx4_dev *dev) 22662306a36Sopenharmony_ci{ 22762306a36Sopenharmony_ci struct mlx4_priv *priv = mlx4_priv(dev); 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci int i; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci mlx4_err(dev, "Internal error detected:\n"); 23262306a36Sopenharmony_ci for (i = 0; i < priv->fw.catas_size; ++i) 23362306a36Sopenharmony_ci mlx4_err(dev, " buf[%02x]: %08x\n", 23462306a36Sopenharmony_ci i, swab32(readl(priv->catas_err.map + i))); 23562306a36Sopenharmony_ci} 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_cistatic void poll_catas(struct timer_list *t) 23862306a36Sopenharmony_ci{ 23962306a36Sopenharmony_ci struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer); 24062306a36Sopenharmony_ci struct mlx4_dev *dev = &priv->dev; 24162306a36Sopenharmony_ci u32 slave_read; 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci if (mlx4_is_slave(dev)) { 24462306a36Sopenharmony_ci slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); 24562306a36Sopenharmony_ci if (mlx4_comm_internal_err(slave_read)) { 24662306a36Sopenharmony_ci mlx4_warn(dev, "Internal error detected on the communication channel\n"); 24762306a36Sopenharmony_ci goto internal_err; 24862306a36Sopenharmony_ci } 24962306a36Sopenharmony_ci } else if (readl(priv->catas_err.map)) { 25062306a36Sopenharmony_ci dump_err_buf(dev); 25162306a36Sopenharmony_ci goto internal_err; 25262306a36Sopenharmony_ci } 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_ci if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { 25562306a36Sopenharmony_ci mlx4_warn(dev, "Internal error mark was detected on device\n"); 25662306a36Sopenharmony_ci goto internal_err; 25762306a36Sopenharmony_ci } 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci mod_timer(&priv->catas_err.timer, 26062306a36Sopenharmony_ci round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); 26162306a36Sopenharmony_ci return; 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ciinternal_err: 26462306a36Sopenharmony_ci if (mlx4_internal_err_reset) 26562306a36Sopenharmony_ci queue_work(dev->persist->catas_wq, &dev->persist->catas_work); 26662306a36Sopenharmony_ci} 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_cistatic void catas_reset(struct work_struct *work) 26962306a36Sopenharmony_ci{ 27062306a36Sopenharmony_ci struct mlx4_dev_persistent *persist = 27162306a36Sopenharmony_ci container_of(work, struct mlx4_dev_persistent, 27262306a36Sopenharmony_ci catas_work); 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci mlx4_handle_error_state(persist); 27562306a36Sopenharmony_ci} 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_civoid mlx4_start_catas_poll(struct mlx4_dev *dev) 27862306a36Sopenharmony_ci{ 27962306a36Sopenharmony_ci struct mlx4_priv *priv = mlx4_priv(dev); 28062306a36Sopenharmony_ci phys_addr_t addr; 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci INIT_LIST_HEAD(&priv->catas_err.list); 28362306a36Sopenharmony_ci timer_setup(&priv->catas_err.timer, poll_catas, 0); 28462306a36Sopenharmony_ci priv->catas_err.map = NULL; 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci if (!mlx4_is_slave(dev)) { 28762306a36Sopenharmony_ci addr = pci_resource_start(dev->persist->pdev, 28862306a36Sopenharmony_ci priv->fw.catas_bar) + 28962306a36Sopenharmony_ci priv->fw.catas_offset; 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); 29262306a36Sopenharmony_ci if (!priv->catas_err.map) { 29362306a36Sopenharmony_ci mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", 29462306a36Sopenharmony_ci (unsigned long long)addr); 29562306a36Sopenharmony_ci return; 29662306a36Sopenharmony_ci } 29762306a36Sopenharmony_ci } 29862306a36Sopenharmony_ci 29962306a36Sopenharmony_ci priv->catas_err.timer.expires = 30062306a36Sopenharmony_ci round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); 30162306a36Sopenharmony_ci add_timer(&priv->catas_err.timer); 30262306a36Sopenharmony_ci} 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_civoid mlx4_stop_catas_poll(struct mlx4_dev *dev) 30562306a36Sopenharmony_ci{ 30662306a36Sopenharmony_ci struct mlx4_priv *priv = mlx4_priv(dev); 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci del_timer_sync(&priv->catas_err.timer); 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci if (priv->catas_err.map) { 31162306a36Sopenharmony_ci iounmap(priv->catas_err.map); 31262306a36Sopenharmony_ci priv->catas_err.map = NULL; 31362306a36Sopenharmony_ci } 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) 31662306a36Sopenharmony_ci flush_workqueue(dev->persist->catas_wq); 31762306a36Sopenharmony_ci} 31862306a36Sopenharmony_ci 31962306a36Sopenharmony_ciint mlx4_catas_init(struct mlx4_dev *dev) 32062306a36Sopenharmony_ci{ 32162306a36Sopenharmony_ci INIT_WORK(&dev->persist->catas_work, catas_reset); 32262306a36Sopenharmony_ci dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); 32362306a36Sopenharmony_ci if (!dev->persist->catas_wq) 32462306a36Sopenharmony_ci return -ENOMEM; 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci return 0; 32762306a36Sopenharmony_ci} 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_civoid mlx4_catas_end(struct mlx4_dev *dev) 33062306a36Sopenharmony_ci{ 33162306a36Sopenharmony_ci if (dev->persist->catas_wq) { 33262306a36Sopenharmony_ci destroy_workqueue(dev->persist->catas_wq); 33362306a36Sopenharmony_ci dev->persist->catas_wq = NULL; 33462306a36Sopenharmony_ci } 33562306a36Sopenharmony_ci} 336